Skip to content

Commit

Permalink
Adiciona novo spider base ATENDE e alguns de seus municípios (#1145)
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju authored May 16, 2024
2 parents 87bb0f7 + b37da6c commit 0edf4a4
Show file tree
Hide file tree
Showing 12 changed files with 190 additions and 55 deletions.
92 changes: 92 additions & 0 deletions data_collection/gazette/spiders/base/atende_layoutdois.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import re

import dateparser
from scrapy import FormRequest

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseAtendeL2Spider(BaseGazetteSpider):
    """
    Base spider for gazettes of cities hosted on
    https://{city_subdomain}.atende.net

    This base class deals with 'Layout 2' gazette pages, usually requested
    from 'https://{city_subdomain}.atende.net/diariooficial'.

    Child classes must set ``city_subdomain`` and may override ``power``.
    ``start_date`` and ``end_date`` are read from the instance but are not
    defined in this class (presumably supplied by ``BaseGazetteSpider`` or
    by the child class -- confirm).
    """

    allowed_domains = ["atende.net"]
    BASE_URL = ""

    # Must be defined into child classes
    city_subdomain = ""

    # Value emitted as ``power`` on every Gazette item. Child classes may
    # override it (e.g. ``power = "executive"``).
    power = "executive_legislative"

    # Edition types that mark an extra edition. The alternatives must not be
    # padded with spaces: "a | b" only matches "a " or " b", so a bare
    # "Suplementar" would never be recognised.
    _EXTRA_EDITION_RE = re.compile(
        r"suplementar|retificação|extraordinária|extra", re.IGNORECASE
    )

    def start_requests(self):
        """Build the city's listing endpoint and request its first page."""
        self.BASE_URL = f"https://{self.city_subdomain}.atende.net/diariooficial/edicao/pagina/atende.php"

        yield self._listing_request(1)

    def parse(self, response, page):
        """
        Yield one Gazette per listing row, then request the following page.

        :param response: listing page returned by the plugin endpoint.
        :param page: 1-based number of the page held by ``response``.

        Rows newer than ``end_date`` are skipped. The first row older than
        ``start_date`` ends the whole crawl (no further page is requested),
        which relies on the listing being sorted from newest to oldest.
        """
        for item in response.css("div.nova_listagem div.linha"):
            date_raw = item.css("div.data::text").get()
            parsed_date = (
                dateparser.parse(date_raw, languages=["pt"]) if date_raw else None
            )
            if parsed_date is None:
                # A row without a usable date cannot be filtered or stored;
                # skip it instead of aborting the remaining rows of the page.
                self.logger.warning(
                    "Skipping row with unparseable date %r at %s",
                    date_raw,
                    response.url,
                )
                continue
            date = parsed_date.date()

            if date > self.end_date:
                continue
            if date < self.start_date:
                return

            # ``div.tipo`` may have no text node; treat that as a regular edition
            edition_type = item.css("div.tipo::text").get() or ""
            is_extra = bool(self._EXTRA_EDITION_RE.search(edition_type))

            edition_number = item.css("div.titulo::text").re_first(r"\d+")

            # The last button carrying ``data-link`` holds the download URL
            download_links = item.css("button::attr(data-link)").getall()
            if not download_links:
                self.logger.warning(
                    "Skipping row without download link (%s) at %s",
                    date,
                    response.url,
                )
                continue

            yield Gazette(
                date=date,
                edition_number=edition_number,
                is_extra_edition=is_extra,
                file_urls=[download_links[-1]],
                power=self.power,
            )

        if page < self.get_last_page(response):
            yield self._listing_request(page + 1)

    def get_params(self, filtro, value):
        """
        Return the query parameters expected by the gazette plugin endpoint.

        :param filtro: ``"pagina"`` to request a listing page or ``"edicao"``
            to request a single edition.
        :param value: page number or edition code, according to ``filtro``.
        :return: dict of query parameters. ``"parametro"`` is only included
            when ``filtro`` is one of the two values above.
        """
        params = {
            "rot": "54015",
            "aca": "101",
            "ajax": "t",
            "processo": "loadPluginDiarioOficial",
        }
        # ``parametro`` is a JSON document passed as a single query value
        if filtro == "pagina":
            params["parametro"] = (
                f'{{"codigoPlugin":1,"filtroPlugin":{{"pagina":"{value}"}}}}'
            )
        elif filtro == "edicao":
            params["parametro"] = (
                f'{{"codigoPlugin":2,"filtroPlugin":{{"codigoEdicao":"{value}"}}}}'
            )

        return params

    def get_last_page(self, response):
        """
        Return the number of the last listing page.

        The value is read from the last matching pagination button. When the
        response has no such button, 1 is returned so that no further page
        is requested.
        """
        page_values = response.css(
            "div#paginacao li.dst button::attr(value)"
        ).getall()
        if not page_values:
            return 1
        return int(page_values[-1])

    def _listing_request(self, page):
        """Build the GET request for listing page number ``page``."""
        return FormRequest(
            url=self.BASE_URL,
            method="GET",
            formdata=self.get_params("pagina", page),
            cb_kwargs={"page": page},
        )
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/rs/rs_bento_goncalves.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsBentoGoncalvesSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``bentogoncalves`` atende.net subdomain."""

    name = "rs_bento_goncalves"
    TERRITORY_ID = "4302105"
    city_subdomain = "bentogoncalves"
    start_date = date(2019, 4, 1)  # Edition 1124
    # NOTE(review): the author left a bare "power" placeholder here; no
    # ``power`` value is set for this city.
1 change: 1 addition & 0 deletions data_collection/gazette/spiders/rs/rs_camaqua.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ class RsCamaquaSpider(BaseInstarSpider):
allowed_domains = ["camaqua.rs.gov.br"]
base_url = "https://www.camaqua.rs.gov.br/portal/diario-oficial"
start_date = date(2019, 7, 25)
end_date = date(2023, 7, 19)
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_candelaria.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsCandelariaSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``candelaria`` atende.net subdomain."""

    name = "rs_candelaria"
    TERRITORY_ID = "4304200"
    city_subdomain = "candelaria"
    start_date = date(2023, 5, 7)  # Edition 1
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_dois_irmaos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsDoisIrmaosSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``doisirmaos`` atende.net subdomain."""

    name = "rs_dois_irmaos"
    TERRITORY_ID = "4306403"
    city_subdomain = "doisirmaos"
    start_date = date(2020, 1, 7)  # Edition 1
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_estrela.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsEstrelaSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``estrela`` atende.net subdomain."""

    name = "rs_estrela"
    TERRITORY_ID = "4307807"
    city_subdomain = "estrela"
    start_date = date(2021, 3, 29)  # Edition 1
61 changes: 6 additions & 55 deletions data_collection/gazette/spiders/rs/rs_gravatai.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,11 @@
from dateparser import parse
from scrapy import Request
from datetime import date

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider
from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


# NOTE(review): this span is a rendered diff without +/- markers. The removed
# BaseGazetteSpider-based implementation and the added BaseAtendeL2Spider-based
# class are interleaved, which is why two class headers follow each other.
class RsGravataiSpider(BaseGazetteSpider):
class RsGravataiSpider(BaseAtendeL2Spider):
TERRITORY_ID = "4309209"
name = "rs_gravatai"
allowed_domains = ["gravatai.atende.net"]
start_urls = ["https://gravatai.atende.net/?pg=diariooficial"]

# Edition types that parse_gazette treats as extra editions (exact match)
extra_editions_options = ("Suplementar", "Retificação")

# Reads the last page number from the pagination widget and requests every
# listing page from 1 up to and including that number.
def parse(self, response):
"""
@url https://gravatai.atende.net/?pg=diariooficial
@returns requests 1
"""

last_page_number_css = "#paginacao > ul > li:nth-child(7) > button::attr(value)"
last_page_number = int(response.css(last_page_number_css).extract_first())

for page_number in range(1, last_page_number + 1):
yield Request(
f"https://gravatai.atende.net/?pg=diariooficial&pagina={page_number}",
callback=self.parse_gazette,
)

# Yields one Gazette per listing row of a single listing page.
def parse_gazette(self, response):
"""
@url https://gravatai.atende.net/?pg=diariooficial&pagina=1
@returns items 1
@scrapes date file_urls is_extra_edition power
"""

for element in response.css(".nova_listagem > .linha"):
info = element.css(".info")

# Extra edition only when the type text equals one of the known options
is_extra_edition = (
info.css(".tipo::text").extract_first() in self.extra_editions_options
)

date = parse(
info.css(".data::text").extract_first(), languages=["pt"]
).date()

# The download URL is built from the button's data-codigo attribute
code = element.css(".opcoes > button::attr(data-codigo)").extract_first()
url = (
"https://gravatai.atende.net/atende.php?rot=54002&aca=737"
f"&processo=download&codigo={code}"
)

yield Gazette(
date=date,
file_urls=[url],
is_extra_edition=is_extra_edition,
power="executive",
)
start_date = date(2015, 5, 4) # Edition 1
city_subdomain = "gravatai"
power = "executive"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_horizontina.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsHorizontinaSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``horizontina`` atende.net subdomain."""

    name = "rs_horizontina"
    TERRITORY_ID = "4309605"
    city_subdomain = "horizontina"
    start_date = date(2016, 6, 15)  # Edition 1
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_panambi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsPanambiSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``panambi`` atende.net subdomain."""

    name = "rs_panambi"
    TERRITORY_ID = "4313904"
    city_subdomain = "panambi"
    start_date = date(2021, 4, 14)  # Edition 1
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_santa_rosa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsSantaRosaSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``santarosa`` atende.net subdomain."""

    name = "rs_santa_rosa"
    TERRITORY_ID = "4317202"
    city_subdomain = "santarosa"
    start_date = date(2022, 8, 23)  # Edition 1
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_sao_joao_do_polesine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsSaoJoaoDoPolesineSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``saojoaodopolesine`` atende.net subdomain."""

    name = "rs_sao_joao_do_polesine"
    TERRITORY_ID = "4318432"
    city_subdomain = "saojoaodopolesine"
    start_date = date(2021, 5, 28)  # Edition 1
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_sobradinho.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende_layoutdois import BaseAtendeL2Spider


class RsSobradinhoSpider(BaseAtendeL2Spider):
    """Gazette spider for the ``sobradinho`` atende.net subdomain."""

    name = "rs_sobradinho"
    TERRITORY_ID = "4320701"
    city_subdomain = "sobradinho"
    start_date = date(2020, 3, 5)  # Edition 1

0 comments on commit 0edf4a4

Please sign in to comment.