diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index e83111be8..08e9ba12c 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -1,4 +1,4 @@ -from datetime import date, datetime +from datetime import date, datetime as dt from dateutil.rrule import MONTHLY, rrule, rruleset from scrapy import FormRequest, Request @@ -13,14 +13,8 @@ class EsVitoriaSpider(BaseGazetteSpider): allowed_domains = ["diariooficial.vitoria.es.gov.br"] start_date = date(2014, 7, 21) - # When there are too many requests, the server may return - # an HTTP 406 status code when trying to download a PDF file - # - # We set `custom_settings` to avoid triggering the 406 HTTP status code - # by spreading the downloads for this spider over time - custom_settings = { - "DOWNLOAD_DELAY": 0.3, # 300 ms + "DOWNLOAD_DELAY": 0.3, "RANDOMIZE_DOWNLOAD_DELAY": True, "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406], } @@ -40,7 +34,7 @@ def make_year_request(self, response): monthly_dates.rrule( rrule(MONTHLY, dtstart=self.start_date, until=self.end_date, bymonthday=[1]) ) - monthly_dates.rdate(date(self.start_date.year, self.start_date.month, 1)) + monthly_dates.rdate(dt(self.start_date.year, self.start_date.month, 1)) for monthly_date in monthly_dates: formdata = {self.FORM_PARAM_YEAR: str(monthly_date.year)} @@ -49,7 +43,7 @@ def make_year_request(self, response): response, formdata=formdata, callback=self.make_month_request, - # We are isolating cookiejar like (year, month) combination + # We are isolating cookiejar in (year, month) combination # to avoid interference between concurrent requests meta={"cookiejar": (monthly_date.year, monthly_date.month)}, ) @@ -74,7 +68,7 @@ def make_month_request(self, response): def parse_editions_list(self, response, current_page=1): for row in response.xpath("//tbody//td/a[1]"): raw_date = row.css("span::text")[0].get().split()[-1] - gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date() + gazette_date = dt.strptime(raw_date, "%d/%m/%Y").date() if self.start_date <= gazette_date <= self.end_date: url = response.urljoin(row.css("a").attrib["href"]) @@ -87,24 +81,22 @@ def parse_editions_list(self, response, current_page=1): power="executive", ) - has_next_page = ( - response.css(".pagination li")[-1].css("a::text").get() is not None - ) - if has_next_page: - next_page = current_page + 1 - year, month = response.meta.get("cookiejar") - - formdata = { - self.FORM_PARAM_YEAR: str(year), - self.FORM_PARAM_MONTH: str(month), - "__EVENTTARGET": self.FORM_PARAM_PAGINATION, - "__EVENTARGUMENT": f"Page${next_page}", - } - - yield FormRequest.from_response( - response, - formdata=formdata, - callback=self.parse_editions_list, - cb_kwargs={"current_page": next_page}, - meta={"cookiejar": response.meta.get("cookiejar")}, - ) + if "pagination" in response.text: + if response.css(".pagination li")[-1].css("a::text").get(): + next_page = current_page + 1 + year, month = response.meta.get("cookiejar") + + formdata = { + self.FORM_PARAM_YEAR: str(year), + self.FORM_PARAM_MONTH: str(month), + "__EVENTTARGET": self.FORM_PARAM_PAGINATION, + "__EVENTARGUMENT": f"Page${next_page}", + } + + yield FormRequest.from_response( + response, + formdata=formdata, + callback=self.parse_editions_list, + cb_kwargs={"current_page": next_page}, + meta={"cookiejar": response.meta.get("cookiejar")}, + )