resolve #750
Showing 1 changed file with 234 additions and 0 deletions.
@@ -0,0 +1,234 @@
from collections.abc import Generator
from datetime import date, datetime

from parsel import Selector
from scrapy.http import FormRequest, Request, TextResponse

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider

BASE_URL: str = "https://diariooficial.vitoria.es.gov.br/"


class EsVitoriaSpider(BaseGazetteSpider):
    name: str = "es_vitoria"
    TERRITORY_ID: str = "3205309"
    start_date: date = date(2014, 7, 21)

    # When there are too many requests, the server may return
    # an HTTP 406 status code when trying to download a PDF file.
    #
    # We set `custom_settings` to avoid triggering the 406 HTTP status code
    # by spreading this spider's downloads over time.
    custom_settings: dict = {
        "DOWNLOAD_DELAY": 0.1,  # 100 ms
        "RANDOMIZE_DOWNLOAD_DELAY": True,
        "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406],
    }

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

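        # Accumulates gazette file URLs per (year, month) and then per gazette
        # date; items are only emitted once the last listing page of a month
        # has been parsed, so every URL for a given date stays together.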
        self.data_by_monthly_date_by_date: dict[
            tuple[
                int,  # year
                int,  # month
            ],
            dict[
                date,  # gazette_date
                list[
                    str,  # url
                ],
            ],
        ] = {}

    def start_requests(self) -> Generator:
        url: str = BASE_URL

        today: date = date.today()
        year: int = today.year
        month: int = today.month

        yield Request(
            url=url,
            callback=self.initial_parse,
            meta={"cookiejar": f"{self.name}_{year}_{month}"},
        )

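    # The source site is an ASP.NET WebForms page: selecting a year or month
    # in the dropdowns triggers a form postback (__EVENTTARGET /
    # __EVENTARGUMENT), so navigation happens by resubmitting the page form
    # with FormRequest.from_response instead of following plain links.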
    def initial_parse(self, response: TextResponse) -> Generator:
        year_select: Selector = response.xpath("//select[contains(@id, 'ddlAno')]")
        year_formkey: str = year_select.attrib["name"]
        years_available: map[int] = map(
            int, year_select.xpath("./option/@value").getall()
        )
        chosen_year: int = int(
            year_select.xpath("./option[contains(@selected, 'selected')]/@value").get()
        )

        year: int
        for year in years_available:
            if year < self.start_date.year or self.end_date.year < year:
                continue

            if year == chosen_year:
                yield from self.parse_year(response, year)
                continue

            yield FormRequest.from_response(
                response,
                formdata={year_formkey: str(year)},
                callback=self.parse_year,
                cb_kwargs={"year": year},
                # We are isolating the cookiejar per name-year-month combination
                # to avoid interference between concurrent requests.
                # Whenever we request a past year, the site resets the month
                # to December.
                meta={"cookiejar": f"{self.name}_{year}_12"},
            )

    def parse_year(self, response: TextResponse, year: int) -> Generator:
        year_select: Selector = response.xpath("//select[contains(@id, 'ddlAno')]")
        year_formkey: str = year_select.attrib["name"]

        month_select: Selector = response.xpath("//select[contains(@id, 'ddlMes')]")
        month_formkey: str = month_select.attrib["name"]

        chosen_month: int = int(
            month_select.xpath("./option[contains(@selected, 'selected')]/@value").get()
        )

        first_day_of_start_date_month: date = date(
            self.start_date.year, self.start_date.month, 1
        )

        month: int
        for month in range(1, 13):
            first_day_of_month: date = date(year, month, 1)
            if (
                first_day_of_month < first_day_of_start_date_month
                or self.end_date < first_day_of_month
            ):
                continue

            current_year_month: tuple[int, int] = (year, month)

            if month == chosen_month:
                yield from self.parse_editions_list(response, current_year_month)
                continue

            formdata: dict[str, str] = {
                "__EVENTTARGET": month_formkey,
                "__EVENTARGUMENT": "",
                year_formkey: str(year),
                month_formkey: str(month),
            }
            yield FormRequest.from_response(
                response,
                formdata=formdata,
                callback=self.parse_editions_list,
                cb_kwargs={
                    "current_year_month": current_year_month,
                },
                # We are isolating the cookiejar per name-year-month combination
                # to avoid interference between concurrent requests.
                meta={"cookiejar": f"{self.name}_{year}_{month}"},
            )

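    # Each month view lists that month's editions in a paged grid
    # ("grdArquivos"). URLs are collected page by page and the Gazette items
    # are only yielded after the grid's last page has been visited.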
    def parse_editions_list(
        self,
        response: TextResponse,
        current_year_month: tuple[int, int],
        current_page: int = 1,
    ) -> Generator:
        year_select: Selector = response.xpath("//select[contains(@id, 'ddlAno')]")
        year_formkey: str = year_select.attrib["name"]

        month_select: Selector = response.xpath("//select[contains(@id, 'ddlMes')]")
        month_formkey: str = month_select.attrib["name"]

        row: Selector
        file_urls: list[str]
        year: int
        month: int

        year, month = current_year_month

        for row in response.xpath(
            "//ancestor::a[span[contains(@id, '_grdArquivos_')]]"
        ):
            raw_string: str = row.xpath("./span/text()").get()
            date_string_from_text: str = raw_string.split()[-1]
            gazette_date: date | None = self._parse_date(date_string_from_text)

            if not gazette_date:
                self.logger.warning(
                    f"No valid date could be extracted from '{raw_string}'"
                )
                continue

            if gazette_date > self.end_date:
                continue
            elif gazette_date < self.start_date:
                return

            if gazette_date.timetuple()[:2] != current_year_month:
                self.logger.warning(
                    f"Found {gazette_date.isoformat()} gazette while querying"
                    f" for {current_year_month[0]}-{current_year_month[1]:02}"
                    f" period. Skipping..."
                )
                continue

            url: str = response.urljoin(row.attrib["href"])

            file_urls = self.data_by_monthly_date_by_date.setdefault(
                current_year_month, {}
            ).setdefault(gazette_date, [])

            if url not in file_urls:
                # We use this strategy to avoid duplicates while maintaining row order
                file_urls.append(url)

        number_of_pages: int = len(
            response.xpath("//ul[contains(@class, 'pagination')]/li").getall()
        )

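        # ASP.NET GridView pagination: the next page is requested by posting
        # back with __EVENTTARGET set to the grid control and __EVENTARGUMENT
        # set to "Page$<n>".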
        if current_page < number_of_pages:
            formdata = {
                "__EVENTARGUMENT": f"Page${current_page + 1}",
                "__EVENTTARGET": "ctl00$conteudo$ucPesquisarDiarioOficial$grdArquivos",
                year_formkey: str(year),
                month_formkey: str(month),
            }

            yield FormRequest.from_response(
                response,
                formdata=formdata,
                callback=self.parse_editions_list,
                cb_kwargs={
                    "current_year_month": current_year_month,
                    "current_page": current_page + 1,
                },
                # We keep using the same cookiejar for the name_year_month
                # combination because, if we don't, it can interfere with the
                # paging data of a different name_year_month combination.
                meta={"cookiejar": f"{self.name}_{year}_{month}"},
            )
        else:
            current_year_month_data: dict[
                date,  # gazette_date
                list[
                    str,  # url
                ],
            ] = self.data_by_monthly_date_by_date.get(current_year_month, {})
            for gazette_date, file_urls in current_year_month_data.items():
                yield Gazette(
                    date=gazette_date,
                    is_extra_edition=False,
                    file_urls=file_urls,
                    power="executive",
                )

    def _parse_date(self, raw_date: str) -> date | None:
        # `strptime` raises ValueError on malformed input; returning None
        # instead lets the caller log a warning and skip the row.
        try:
            return datetime.strptime(raw_date, "%d/%m/%Y").date()
        except ValueError:
            return None
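A quick way to sanity-check the date handling outside of Scrapy (a standalone sketch; `parse_date` below is a hypothetical copy of the `_parse_date` helper above, not part of the commit):

    from datetime import date, datetime

    def parse_date(raw_date: str) -> date | None:
        try:
            return datetime.strptime(raw_date, "%d/%m/%Y").date()
        except ValueError:
            return None

    assert parse_date("21/07/2014") == date(2014, 7, 21)  # the spider's start_date
    assert parse_date("Suplemento") is None  # non-date grid text is skipped

The spider itself would normally be exercised with the standard Scrapy CLI, e.g. `scrapy crawl es_vitoria` (assuming the usual querido-diario setup, where `start_date` and `end_date` can be passed as `-a` spider arguments).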