diff --git a/data_collection/gazette/spiders/base/dosp.py b/data_collection/gazette/spiders/base/dosp.py index 4ff977ca2..35de61a48 100644 --- a/data_collection/gazette/spiders/base/dosp.py +++ b/data_collection/gazette/spiders/base/dosp.py @@ -1,53 +1,42 @@ -import base64 -import datetime -import json -import re +from base64 import b64encode +from datetime import datetime +from json import loads import scrapy -from dateutil.rrule import WEEKLY, rrule from gazette.items import Gazette from gazette.spiders.base import BaseGazetteSpider class DospGazetteSpider(BaseGazetteSpider): - allowed_domains = ["dosp.com.br", "imprensaoficialmunicipal.com.br"] - # Must be defined into child classes code = None start_date = None - def _dosp_request(self, start_date, end_date): - for date in rrule(freq=WEEKLY, dtstart=start_date, until=end_date): - from_date = date.strftime("%Y-%m-%d") - to_date = date + datetime.timedelta(days=6) - to_date = to_date.strftime("%Y-%m-%d") - - yield scrapy.Request( - f"https://dosp.com.br/api/index.php/dioedata.js/{self.code}/{from_date}/{to_date}?callback=dioe" - ) + allowed_domains = ["dosp.com.br"] def start_requests(self): - yield from self._dosp_request(self.start_date, self.end_date) + yield scrapy.Request(f"https://dosp.com.br/api/index.php/dioe.js/{self.code}") def parse(self, response): - # The response are in a javascript format, then needs some clean up - data = json.loads(response.text[6:-2]) - - for item in data["data"]: - code = item["iddo"] - code = str(code).encode("ascii") - pdf_code = base64.b64encode(code).decode("ascii") - file_url = f"https://dosp.com.br/exibe_do.php?i={pdf_code}" - date = datetime.datetime.strptime(item["data"], "%Y-%m-%d").date() - raw_edition_number = re.search(r"(\d+)([a-zA-Z]*)", item["edicao_do"]) - edition_number = raw_edition_number.group(1) - - if self.start_date <= date <= self.end_date: + json_text = ( + response.css("p::text").get().replace("parseResponse(", "") + ).replace(");", "") + + json_text = loads(json_text) + + for diarios in json_text["data"]: + data = datetime.strptime(diarios["data"], "%Y-%m-%d").date() + code_link = str(diarios["iddo"]).encode("ascii") + code_link = b64encode(code_link).decode("ascii") + + if self.start_date <= data <= self.end_date: yield Gazette( - date=date, - file_urls=[file_url], - edition_number=edition_number, - power="executive_legislative", - is_extra_edition=raw_edition_number.group(2) != "", + date=data, + edition_number=diarios["edicao_do"], + file_urls=[ + f"https://dosp.com.br/exibe_do.php?i={code_link}.pdf", + ], + is_extra_edition=diarios["flag_extra"] > 0, + power="executive", ) diff --git a/data_collection/gazette/spiders/mg/mg_uberaba.py b/data_collection/gazette/spiders/mg/mg_uberaba_2003.py similarity index 55% rename from data_collection/gazette/spiders/mg/mg_uberaba.py rename to data_collection/gazette/spiders/mg/mg_uberaba_2003.py index 3639d6bdc..f60633add 100644 --- a/data_collection/gazette/spiders/mg/mg_uberaba.py +++ b/data_collection/gazette/spiders/mg/mg_uberaba_2003.py @@ -3,36 +3,21 @@ from scrapy import FormRequest from gazette.items import Gazette -from gazette.spiders.base.dosp import DospGazetteSpider +from gazette.spiders.base import BaseGazetteSpider -class MgUberabaSpider(DospGazetteSpider): +class MgUberabaSpider(BaseGazetteSpider): TERRITORY_ID = "3170107" - name = "mg_uberaba" + name = "mg_uberaba_2003" - code = 2364 start_date = dt.date(2003, 4, 25) + end_date = dt.date(2021, 9, 1) def start_requests(self): - # Gazettes older than this date didn't use DOSP system - older_collection_end_date = dt.date(2021, 9, 2) - - if self.end_date >= older_collection_end_date: - start_date = max([older_collection_end_date, self.start_date]) - end_date = self.end_date - yield from self._dosp_request(start_date, end_date) - - if self.start_date < older_collection_end_date: - start_date = self.start_date - end_date = min([older_collection_end_date, self.end_date]) - yield from self._older_collection_request(start_date, end_date) - - def _older_collection_request(self, start_date, end_date): - for year in range(start_date.year, end_date.year + 1): + for year in range(self.start_date.year, self.end_date.year + 1): yield FormRequest( url="http://www.uberaba.mg.gov.br/portal/listImagesHtml", method="POST", - callback=self.parse_older_collection, formdata={ "desc": "1", "type": "1", @@ -42,16 +27,24 @@ def _older_collection_request(self, start_date, end_date): "types": "gif,jpg,png,bmp,tif,dxf,swf,dcr,mov,qt,ram,rm,avi,mpg,mpeg,asf,flv,pdf,doc,docx,xls,xlsx,zip,rar,txt,cdr,ai,eps,ppt,pptx,pot,psd,wmv", "listAll": "1", }, - cb_kwargs={"start_date": start_date, "end_date": end_date}, ) - def parse_older_collection(self, response, start_date, end_date): + def parse(self, response): gazettes = response.css(".claGaleriaBoxFileTable") for gazette in gazettes: raw_date = gazette.css("::text").re_first(r"(\d{2}-\d{2}-\d{4})") - gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date() - if gazette_date < start_date or gazette_date > end_date: + try: + gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date() + except Exception: + self.logger.error( + f"Gazette date can't be parsed from gazette named \"{gazette.css('::text').get()}\"" + ) + continue + + if gazette_date > self.end_date: continue + elif gazette_date < self.start_date: + return gazette_url = response.urljoin( gazette.css("img::attr(onclick)").re_first(r"download\(\'(.*)\'\)") diff --git a/data_collection/gazette/spiders/mg/mg_uberaba_2021.py b/data_collection/gazette/spiders/mg/mg_uberaba_2021.py new file mode 100644 index 000000000..62737d4ac --- /dev/null +++ b/data_collection/gazette/spiders/mg/mg_uberaba_2021.py @@ -0,0 +1,10 @@ +import datetime as dt + +from gazette.spiders.base.dosp import DospGazetteSpider + + +class MgUberabaSpider(DospGazetteSpider): + TERRITORY_ID = "3170107" + name = "mg_uberaba_2021" + code = 2364 + start_date = dt.date(2021, 9, 1)