-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
124 lines (75 loc) · 2.92 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from flask import Flask, render_template, url_for, request, jsonify, make_response, flash, redirect
from flask_restful import Api, Resource
import time
import csv
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import requests
import re
import os
from selenium import webdriver
from bs4 import BeautifulSoup
# --- Web application -------------------------------------------------------
# Flask app plus the Flask-RESTful wrapper around it.
app = Flask(__name__)
api = Api(app)

# --- Browser automation ----------------------------------------------------
# Chrome runs headless and is pointed at the "Default" profile directory.
options = Options()
options.add_argument('--headless')
options.add_argument('--profile-directory=Default')

# The chromedriver binary is expected in the current working directory.
ch_path = "/".join((os.getcwd(), "chromedriver"))

# One module-level driver instance, created at import time and passed to the
# scraping helpers by the request handler.
driver = webdriver.Chrome(
    executable_path=ch_path,
    options=options,
)
# Alternative previously used here: webdriver.Chrome("chromedriver.exe")
# Example listing URL: https://www.rekhta.org/poets/mirza-ghalib/ghazals
def download_html(driver, poet_link, *, scroll_pause=0.5, html_path="Ghalib.html"):
    """Load a listing page, scroll it until it stops growing, and save its HTML.

    The page is scrolled to the bottom repeatedly.  After every scroll the
    function waits ``scroll_pause`` seconds and re-reads
    ``document.body.scrollHeight``; scrolling stops as soon as the height is
    unchanged from the previous reading.  ``driver.page_source`` is then
    written to ``html_path``.

    Args:
        driver: Browser driver object.  Only ``get(url)``,
            ``execute_script(js)`` and the ``page_source`` attribute are used.
        poet_link: URL of the page to open.
        scroll_pause: Seconds to wait after each scroll before measuring the
            page height again.  Defaults to the previously hard-coded 0.5.
        html_path: Destination file for the page source.  Defaults to the
            previously hard-coded ``"Ghalib.html"``.

    Raises:
        OSError: If ``html_path`` cannot be opened for writing.
    """
    driver.get(poet_link)

    # Keep scrolling to the bottom until the document height stops changing.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to append more content before re-measuring.
        time.sleep(scroll_pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Explicit UTF-8: the page source may contain non-ASCII text, which the
    # platform default encoding cannot always represent.  The context manager
    # closes the file even if the write fails.
    with open(html_path, 'w', encoding="utf-8") as html_file:
        html_file.write(driver.page_source)
def writing_txt(driver, *, html_path='Ghalib.html', dataset_path="dataset.txt",
                page_delay=1.0):
    """Visit every link found in the saved listing page and dump the text found.

    The HTML file at ``html_path`` is parsed, every ``<a href=...>`` inside a
    ``div`` with class ``contentListItems nwPoetListBody`` is collected, and
    each distinct link is opened with ``driver``.  The text of the first
    ``div`` whose class is exactly ``pMC showTranslation`` on each page is
    written to ``dataset_path``, one entry after another, each followed by a
    newline.  Pages without such a ``div`` print ``Empty`` and are skipped.

    The driver is NOT closed here: it is owned by the caller, and closing it
    would make it unusable for any later call that shares the same instance.

    Args:
        driver: Browser driver object.  Only ``get(url)`` and
            ``find_elements(by, value)`` are used.
        html_path: Saved listing page to read links from.  Defaults to the
            previously hard-coded ``'Ghalib.html'``.
        dataset_path: Text file to (over)write.  Defaults to the previously
            hard-coded ``"dataset.txt"``.
        page_delay: Seconds to wait after opening each link before looking
            for its content.  Defaults to the previously hard-coded 1.0.

    Raises:
        OSError: If ``html_path`` cannot be read or ``dataset_path`` cannot
            be written.
    """
    # Binary mode: let BeautifulSoup work out the document encoding itself
    # instead of decoding with the platform default.
    with open(html_path, 'rb') as page:
        soup = BeautifulSoup(page, 'html.parser')

    links = []
    for listing in soup.find_all('div', {'class': 'contentListItems nwPoetListBody'}):
        for anchor in listing.find_all('a', href=True):
            links.append(anchor['href'])

    # De-duplicate while keeping the order in which links appear on the page,
    # so the dataset is written in a reproducible order.
    links_url = list(dict.fromkeys(links))
    print("Total Ghazals: ", len(links_url))

    # The context manager closes the dataset even if a page load raises.
    with open(dataset_path, 'w', encoding="utf-8") as data:
        for link in links_url:
            driver.get(link)
            # Wait BEFORE the lookup so content rendered shortly after load
            # is found, and so no element reference is held across the wait.
            time.sleep(page_delay)
            # "xpath" is the locator-strategy string (the value of
            # selenium's By.XPATH); find_elements(by, value) is available in
            # both Selenium 3 and 4, unlike find_elements_by_xpath.
            ghazal_content = driver.find_elements(
                "xpath", '//div[@class="pMC showTranslation"]'
            )
            if not ghazal_content:
                print("Empty")
                continue
            data.write(ghazal_content[0].text)
            # Separate entries so the last line of one does not run into the
            # first line of the next.
            data.write("\n")
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the input form, or run the scrape for a submitted URL.

    Any non-POST request renders ``index2.html``.  A POST reads the ``url``
    form field, passes it to ``download_html`` and then runs ``writing_txt``
    with the module-level ``driver``, and finally renders ``content.html``
    with the lines of ``dataset.txt`` as ``b_lines``.
    """
    # Anything other than a form submission just gets the form page.
    if request.method != 'POST':
        return render_template('index2.html')

    # NOTE(review): the submitted URL is handed to the browser as-is.
    download_html(driver, request.form["url"])
    writing_txt(driver)

    # Line endings are kept, exactly as the file iterator yields them.
    with open('dataset.txt', 'r', encoding="utf-8") as dataset:
        scraped_lines = list(dataset)
    return render_template('content.html', b_lines=scraped_lines)
# Start Flask's built-in server with debug mode on, but only when this file
# is executed directly rather than imported.
if __name__ == "__main__":
    app.run(debug=True)