Skip to content

Commit

Permalink
Ajusta custom_settings, paginação e conversão de data
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju committed Jan 13, 2025
1 parent 62cd4bc commit ffca201
Showing 1 changed file with 24 additions and 32 deletions.
56 changes: 24 additions & 32 deletions data_collection/gazette/spiders/es/es_vitoria.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import date, datetime
from datetime import date, datetime as dt

from dateutil.rrule import MONTHLY, rrule, rruleset
from scrapy import FormRequest, Request
Expand All @@ -13,14 +13,8 @@ class EsVitoriaSpider(BaseGazetteSpider):
allowed_domains = ["diariooficial.vitoria.es.gov.br"]
start_date = date(2014, 7, 21)

# When there are too many requests, the server may return
# an HTTP 406 status code when trying to download a PDF file
#
# We set `custom_settings` to avoid triggering the 406 HTTP status code
# by spreading the downloads for this spider over time

custom_settings = {
"DOWNLOAD_DELAY": 0.3, # 300 ms
"DOWNLOAD_DELAY": 0.3,
"RANDOMIZE_DOWNLOAD_DELAY": True,
"RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406],
}
Expand All @@ -40,7 +34,7 @@ def make_year_request(self, response):
monthly_dates.rrule(
rrule(MONTHLY, dtstart=self.start_date, until=self.end_date, bymonthday=[1])
)
monthly_dates.rdate(date(self.start_date.year, self.start_date.month, 1))
monthly_dates.rdate(dt(self.start_date.year, self.start_date.month, 1))

for monthly_date in monthly_dates:
formdata = {self.FORM_PARAM_YEAR: str(monthly_date.year)}
Expand All @@ -49,7 +43,7 @@ def make_year_request(self, response):
response,
formdata=formdata,
callback=self.make_month_request,
# We are isolating cookiejar like (year, month) combination
# We are isolating cookiejar in (year, month) combination
# to avoid interference between concurrent requests
meta={"cookiejar": (monthly_date.year, monthly_date.month)},
)
Expand All @@ -74,7 +68,7 @@ def make_month_request(self, response):
def parse_editions_list(self, response, current_page=1):
for row in response.xpath("//tbody//td/a[1]"):
raw_date = row.css("span::text")[0].get().split()[-1]
gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date()
gazette_date = dt.strptime(raw_date, "%d/%m/%Y").date()

if self.start_date <= gazette_date <= self.end_date:
url = response.urljoin(row.css("a").attrib["href"])
Expand All @@ -87,24 +81,22 @@ def parse_editions_list(self, response, current_page=1):
power="executive",
)

has_next_page = (
response.css(".pagination li")[-1].css("a::text").get() is not None
)
if has_next_page:
next_page = current_page + 1
year, month = response.meta.get("cookiejar")

formdata = {
self.FORM_PARAM_YEAR: str(year),
self.FORM_PARAM_MONTH: str(month),
"__EVENTTARGET": self.FORM_PARAM_PAGINATION,
"__EVENTARGUMENT": f"Page${next_page}",
}

yield FormRequest.from_response(
response,
formdata=formdata,
callback=self.parse_editions_list,
cb_kwargs={"current_page": next_page},
meta={"cookiejar": response.meta.get("cookiejar")},
)
if "pagination" in response.text:
if response.css(".pagination li")[-1].css("a::text").get():
next_page = current_page + 1
year, month = response.meta.get("cookiejar")

formdata = {
self.FORM_PARAM_YEAR: str(year),
self.FORM_PARAM_MONTH: str(month),
"__EVENTTARGET": self.FORM_PARAM_PAGINATION,
"__EVENTARGUMENT": f"Page${next_page}",
}

yield FormRequest.from_response(
response,
formdata=formdata,
callback=self.parse_editions_list,
cb_kwargs={"current_page": next_page},
meta={"cookiejar": response.meta.get("cookiejar")},
)

0 comments on commit ffca201

Please sign in to comment.