folhas_spider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pandas as pd


class FolhaSpider(CrawlSpider):
    """Crawls Folha de S.Paulo's "últimas notícias" page and scrapes each article."""

    name = "folha"
    start_urls = ["https://www1.folha.uol.com.br/ultimas-noticias/"]

    # Accumulators for collected links/data (not populated yet).
    useful_links = []
    data_collected = pd.DataFrame()

    # Follow only article links (Folha article URLs end in ".shtml") and hand
    # each response to get_links. `allow` takes a regex, so the dot is escaped.
    rules = [
        Rule(LinkExtractor(allow=r"\.shtml"), callback="get_links",
             follow=False),
    ]

    def __find_date(self, lista):
        # Folha article timestamps contain the " às " marker (e.g. "... às 10h30").
        # Return the first element that contains it, or None if nothing matches.
        for item in lista:
            if " às " in item:
                return item
        return None

    def get_links(self, response):
        # Folha boilerplate (paywall / newsletter prompts) stripped from the
        # article body before it is saved.
        replace_dict = {
            "\n": "",
            "\t": "",
            "Assinantes podem liberar 5 acessos por dia para conteúdos da Folha":
                "",
            "Gostaria de receber as principais notícias": "",
            " do Brasil e do mundo?": "",
        }

        # The article slug (last URL segment, minus ".shtml") doubles as a title.
        title = response.url.split("/")[-1].replace(".shtml", "").replace("-", " ")

        # Paragraph text plus the <time> element text holding the publication timestamp.
        paragraphs = response.xpath('//p/text()').extract()
        date = self.__find_date(response.xpath('//time//text()').extract())

        # Join the paragraphs and strip the boilerplate fragments.
        # TODO: the editor byline ("Editado por ...") could also be stripped here.
        text = ''.join(paragraphs)
        for key, value in replace_dict.items():
            text = text.replace(key, value)

        # Yield one item per article so the configured CSV feed receives a row.
        yield {"title": title, "date": date, "text": text}

    # Possible future callback for scraping a specific paragraph:
    # def get_paragraphs(self, response):
    #     //*[@id="conteudo"]/div[3]/p[5]/text()


if __name__ == "__main__":
    import logging

    from scrapy.crawler import CrawlerProcess

    # Silence Scrapy's own log output.
    logging.getLogger('scrapy').propagate = False

    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_ENABLED': False,
        # Export scraped items to a file (CSV here; JSON and XML also work).
        # Note: newer Scrapy versions replace FEED_FORMAT/FEED_URI with FEEDS.
        'FEED_FORMAT': 'csv',  # csv, json, xml
        'FEED_URI': 'output.csv',
    })
    c.crawl(FolhaSpider)
    c.start()
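
# A minimal post-processing sketch (an assumption, not part of the spider):
# once the crawl finishes, the CSV written by the feed exporter can be loaded
# with pandas, which is already imported above. The column names (title, date,
# text) follow the dict yielded by get_links.
#
#     df = pd.read_csv("output.csv")
#     print(df.head())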