-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path.old.main.py
139 lines (108 loc) · 3.95 KB
/
.old.main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import shutil
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.enum.text import WD_BREAK
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from bcolors import bcolors
from os.path import basename
from ebook import makeEpub
from ebooklib import epub
# Request headers carrying a desktop Chrome User-Agent string; passed to
# requests.get() so the HTTP requests identify themselves as a regular browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/90.0.4430.212 Safari/537.36"
    ),
}
# --- Interactive setup: ask for the first chapter URL and load that page ---
print(bcolors.LightMagenta + "Podaj link do pierwszego rozdziału: " + bcolors.ResetAll)
# str.strip() returns a new string; the result must be kept, otherwise stray
# whitespace pasted around the link ends up in the URL.
startURL = input().strip()
URL = startURL
print(bcolors.Green + "Przygotowywanie" + bcolors.ResetAll)

# The first page is fetched through a Selenium-driven Firefox instance.
# `browser` starts as None so the cleanup below is safe even when the driver
# itself fails to start (previously that raised NameError inside the handler).
browser = None
try:
    browser = webdriver.Firefox()
    browser.get(URL)
    html_source = browser.page_source
except Exception:
    # Top-level boundary of the script: any failure while loading the page is
    # reported to the user as a bad URL and ends the run with a non-zero
    # status. KeyboardInterrupt / SystemExit are deliberately not caught.
    print(bcolors.Red + "Złe URL!" + bcolors.ResetAll)
    raise SystemExit(1) from None
finally:
    # Always close the browser window, on success and on failure alike.
    if browser is not None:
        browser.quit()

soup = BeautifulSoup(html_source, 'html.parser')
print(bcolors.Green + "Zakończono przygotowania" + bcolors.ResetAll)

# The book title is the text of the element with class "title h5".
# find() returns None when no such element exists, so fail with a readable
# message instead of an AttributeError on None.
title_tag = soup.find(class_="title h5")
if title_tag is None:
    print(bcolors.Red + "Nie znaleziono tytułu książki!" + bcolors.ResetAll)
    raise SystemExit(1)
book_title = title_tag.get_text().strip()
print(bcolors.Green + "Tytuł książki:", book_title + bcolors.ResetAll)

# The output document: a new docx whose level-0 heading is the book title.
document = Document()
print(bcolors.Green + "Utworzono nowy plik docx" + bcolors.ResetAll)
document.add_heading(book_title, 0)
def getChapterTitle(soup):
    """Return the chapter title of a parsed chapter page.

    The title is the stripped text of the ``<h1 class="h2">`` element located
    inside the element with class ``row part-header``.

    Args:
        soup: Parsed page to search (anything offering BeautifulSoup's
            ``find`` interface).

    Returns:
        The heading text with surrounding whitespace removed.

    Raises:
        AttributeError: If either element is missing (``find`` returned None).
    """
    header = soup.find(class_="row part-header")
    heading = header.find("h1", class_="h2")
    return heading.get_text().strip()
def getText(soup):
    """Return the chapter body text of a parsed chapter page.

    For every ``<div class="page">`` in document order, the text of its first
    descendant with class ``panel`` is taken; the pieces are concatenated
    without a separator and the combined string is stripped.

    Args:
        soup: Parsed page to search.

    Returns:
        The concatenated text; an empty string when no page div exists.

    Raises:
        AttributeError: If a page div has no ``panel`` element.
    """
    pages = soup.find_all("div", class_="page")
    pieces = (page.find(class_="panel").get_text() for page in pages)
    return "".join(pieces).strip()
def writeTextDoc(title, text, document):
    """Append one chapter to *document*.

    Adds a level-1 heading holding *title*, followed by a run containing a
    break of the default type, then a paragraph holding *text*, followed by a
    run containing a page break so the next chapter starts on a new page.

    Args:
        title: Chapter title used as the heading text.
        text: Chapter body placed in a single paragraph.
        document: Document object the heading and paragraph are added to.
    """
    heading = document.add_heading(title, level=1)
    heading.add_run().add_break()

    body = document.add_paragraph(text)
    body.add_run().add_break(WD_BREAK.PAGE)
def generateChapters(title, text):
    """Build an EPUB chapter object from a plain-text title and body.

    The title and text are HTML-escaped before being placed in the markup, so
    characters such as ``&``, ``<`` and ``>`` in the scraped text cannot break
    the generated document or be interpreted as tags.

    Args:
        title: Chapter title (plain text); used both as the chapter's title
            attribute and as the ``<h1>`` heading.
        text: Chapter body (plain text); placed in a single ``<p>`` element.

    Returns:
        The ``epub.EpubHtml`` item with its ``content`` set.
    """
    # Local import keeps this block self-contained; html is standard library.
    from html import escape

    chapter = epub.EpubHtml(title=title)
    # quote=False: only &, < and > need escaping inside element content.
    chapter.content = (
        '<html><head></head><body>'
        f'<h1>{escape(title, quote=False)}</h1>'
        f'<p>{escape(text, quote=False)}</p>'
        '</body></html>'
    )
    return chapter
def nextURL(soup):
    """Return the link to the next chapter, or ``""`` when there is none.

    Looks up the element with class ``on-navigate next-part-link`` and returns
    its ``href`` attribute.

    Args:
        soup: Parsed page to search.

    Returns:
        The ``href`` value, or an empty string when the element is missing or
        carries no ``href`` attribute.
    """
    link = soup.find(class_="on-navigate next-part-link")
    if link is None:
        # No "next part" element on this page: this is the last chapter.
        return ""
    # Handle only the expected "attribute absent" case instead of hiding every
    # possible error (including KeyboardInterrupt) behind a bare except.
    return link.get("href", "")
def get_images(soup, url):
    """Download every ``<img class="cover">`` image referenced by the page.

    Each image is saved in the current working directory under the last path
    segment of its ``src`` attribute. Failures are reported on stdout and the
    remaining images are still attempted.

    Args:
        soup: Parsed page to search for cover images.
        url: Base URL used to resolve relative ``src`` values.

    Returns:
        The file name of the last image that was saved successfully, or
        ``None`` when nothing was downloaded (no cover image on the page, or
        every download failed).
    """
    image_links = [img.get('src') for img in soup.find_all('img', class_="cover")]
    print(str(len(image_links)) + " images found.")
    print('Downloading cover photo.')

    # Bound up front so the final return is valid even when the loop never
    # saves anything (this used to raise UnboundLocalError).
    filename = None
    for link in image_links:
        print(f"{link}: ")
        if not link:
            # <img> without a usable src attribute: nothing to download.
            print('An error occured. Continuing.')
            continue
        candidate = link.strip().split('/')[-1].strip()
        src = urljoin(url, link)
        print('Getting: ' + candidate)
        try:
            # The context manager releases the connection; the timeout keeps a
            # stalled server from hanging the script forever.
            with requests.get(src, stream=True, timeout=30) as response:
                # Do not write an HTTP error page to disk as if it were an image.
                response.raise_for_status()
                # Have urllib3 undo gzip/deflate transfer encoding on the raw
                # stream, otherwise the saved bytes may be compressed.
                response.raw.decode_content = True
                with open(candidate, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
        except Exception:
            # Best effort by design: report the failure and carry on with the
            # next image. KeyboardInterrupt / SystemExit still propagate.
            print('An error occured. Continuing.')
        else:
            filename = candidate
    print('Done.')
    return filename
# --- Main loop: add the current chapter to the document, then follow the
# "next part" link until a page has none. ---
while URL:
    # Drop inline comment markers so they do not end up in the chapter text.
    for span in soup.find_all("span", {'class': 'comment-marker'}):
        span.decompose()
    # Turn <br> tags into newline characters so line breaks survive get_text().
    for br in soup.find_all("br"):
        br.replace_with("\n")

    title = getChapterTitle(soup)
    text = getText(soup)
    print(bcolors.Green + "Dodawanie:", title + bcolors.ResetAll)
    writeTextDoc(title, text, document)
    # Confirm right after writing so the last chapter is reported too
    # (previously the loop exited before printing this for the final one).
    print(bcolors.LightGreen + "Dodano:", title + bcolors.ResetAll)

    next_href = nextURL(soup)
    # Resolve against the current page so a relative href also works; an
    # absolute href passes through urljoin unchanged. "" ends the loop.
    URL = urljoin(URL, next_href) if next_href else ""
    if URL:
        page = requests.get(URL, headers=headers)
        # Fail with a clear HTTP error instead of parsing an error page as a
        # chapter.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
# --- Save step: choose the output file name and write the docx ---
# Default name: the book title reduced to its alphanumeric characters.
file_name = "".join(filter(str.isalnum, book_title))
print(f"{bcolors.LightMagenta}Nazwa pliku ({file_name}):{bcolors.ResetAll}")
# An empty answer keeps the default; anything else replaces it as typed.
file_name_user = input()
if file_name_user != "":
    file_name = file_name_user
document.save(f"{file_name}.docx")
print(bcolors.Green + "Książkę zapisano pod nazwą", file_name + bcolors.ResetAll)