Ajusta custom_settings, paginação e conversão de data

okfn-brasil · Jan 13, 2025 · ffca201
1 parent 62cd4bc
commit ffca201
Showing 1 changed file with 24 additions and 32 deletions.
diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py
@@ -1,4 +1,4 @@
-from datetime import date, datetime
+from datetime import date, datetime as dt
 
 from dateutil.rrule import MONTHLY, rrule, rruleset
 from scrapy import FormRequest, Request
@@ -13,14 +13,8 @@ class EsVitoriaSpider(BaseGazetteSpider):
     allowed_domains = ["diariooficial.vitoria.es.gov.br"]
     start_date = date(2014, 7, 21)
 
-    # When there are too many requests, the server may return
-    # an HTTP 406 status code when trying to download a PDF file
-    #
-    # We set `custom_settings` to avoid triggering the 406 HTTP status code
-    # by spreading the downloads for this spider over time
-
     custom_settings = {
-        "DOWNLOAD_DELAY": 0.3,  # 300 ms
+        "DOWNLOAD_DELAY": 0.3,
         "RANDOMIZE_DOWNLOAD_DELAY": True,
         "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406],
     }
@@ -40,7 +34,7 @@ def make_year_request(self, response):
         monthly_dates.rrule(
             rrule(MONTHLY, dtstart=self.start_date, until=self.end_date, bymonthday=[1])
         )
-        monthly_dates.rdate(date(self.start_date.year, self.start_date.month, 1))
+        monthly_dates.rdate(dt(self.start_date.year, self.start_date.month, 1))
 
         for monthly_date in monthly_dates:
             formdata = {self.FORM_PARAM_YEAR: str(monthly_date.year)}
@@ -49,7 +43,7 @@ def make_year_request(self, response):
                 response,
                 formdata=formdata,
                 callback=self.make_month_request,
-                # We are isolating cookiejar like (year, month) combination
+                # Use a separate cookiejar for each (year, month) combination
                 # to avoid interference between concurrent requests
                 meta={"cookiejar": (monthly_date.year, monthly_date.month)},
             )
@@ -74,7 +68,7 @@ def make_month_request(self, response):
     def parse_editions_list(self, response, current_page=1):
         for row in response.xpath("//tbody//td/a[1]"):
             raw_date = row.css("span::text")[0].get().split()[-1]
-            gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date()
+            gazette_date = dt.strptime(raw_date, "%d/%m/%Y").date()
 
             if self.start_date <= gazette_date <= self.end_date:
                 url = response.urljoin(row.css("a").attrib["href"])
@@ -87,24 +81,22 @@ def parse_editions_list(self, response, current_page=1):
                     power="executive",
                 )
 
-        has_next_page = (
-            response.css(".pagination li")[-1].css("a::text").get() is not None
-        )
-        if has_next_page:
-            next_page = current_page + 1
-            year, month = response.meta.get("cookiejar")
-
-            formdata = {
-                self.FORM_PARAM_YEAR: str(year),
-                self.FORM_PARAM_MONTH: str(month),
-                "__EVENTTARGET": self.FORM_PARAM_PAGINATION,
-                "__EVENTARGUMENT": f"Page${next_page}",
-            }
-
-            yield FormRequest.from_response(
-                response,
-                formdata=formdata,
-                callback=self.parse_editions_list,
-                cb_kwargs={"current_page": next_page},
-                meta={"cookiejar": response.meta.get("cookiejar")},
-            )
+        if "pagination" in response.text:
+            if response.css(".pagination li")[-1].css("a::text").get():
+                next_page = current_page + 1
+                year, month = response.meta.get("cookiejar")
+
+                formdata = {
+                    self.FORM_PARAM_YEAR: str(year),
+                    self.FORM_PARAM_MONTH: str(month),
+                    "__EVENTTARGET": self.FORM_PARAM_PAGINATION,
+                    "__EVENTARGUMENT": f"Page${next_page}",
+                }
+
+                yield FormRequest.from_response(
+                    response,
+                    formdata=formdata,
+                    callback=self.parse_editions_list,
+                    cb_kwargs={"current_page": next_page},
+                    meta={"cookiejar": response.meta.get("cookiejar")},
+                )