Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add scraper for omdena and methods for retrieving datasets, projects and blogs #1107

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions dev-documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -1962,6 +1962,23 @@ The `cars` class includes methods for retrieving job listings:
| --------------- | ------------------------------------------------------------------- |
| `.fetch_cars()` | Retrieve car listings from the website for a specified single page. |

---

#### Omdena

```python
from scrape_up.omdena import Omdena

omdena = Omdena()
print(omdena.get_projects())
```

| Methods | Details |
| --------------------------- | ---------------------------------------------------------------------------------------------------- |
| `.get_datasets()` | Returns a dictionary of datasets in Omdena. |
| `.get_projects()` | Returns a dictionary of latest ongoing projects in Omdena. |
| `.get_blogs()` | Returns a dictionary of latest blogs in Omdena. |

---

#### Letterboxd

Expand Down
13 changes: 13 additions & 0 deletions docs/modules/omdena.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Create an instance of the `Omdena` class.

```python
from scrape_up.omdena import Omdena

omdena = Omdena()
```

| Methods | Details |
| --------------------------- | ---------------------------------------------------------------------------------------------------- |
| `.get_datasets()` | Returns a dictionary of datasets in Omdena. |
| `.get_projects()` | Returns a dictionary of latest ongoing projects in Omdena. |
| `.get_blogs()` | Returns a dictionary of latest blogs in Omdena. |
3 changes: 3 additions & 0 deletions src/scrape_up/omdena/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .omdena import Omdena

__all__ = ["Omdena"]
144 changes: 144 additions & 0 deletions src/scrape_up/omdena/omdena.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import json
from bs4 import BeautifulSoup as Soup
from scrape_up.config.request_config import RequestConfig, get


class Omdena:
    """
    Class - `Omdena`

    Scrapes public listings (datasets, blogs, projects) from Omdena.

    | Methods           | Details                                                    |
    | ----------------- | ---------------------------------------------------------- |
    | `.get_datasets()` | Returns a dictionary of datasets in Omdena.                |
    | `.get_projects()` | Returns a dictionary of latest ongoing projects in Omdena. |
    | `.get_blogs()`    | Returns a dictionary of latest blogs in Omdena.            |

    Every public method is best-effort: it returns `None` instead of raising
    when the request fails or the page does not have the expected structure.
    """

    # The dataset catalogue is requested from its own host. It is kept as a
    # constant so that `get_datasets()` never has to overwrite `self.url`.
    _DATASETS_URL = "https://datasets.omdena.com"

    # Open Graph keys that are deliberately left out of the returned entries.
    _SKIPPED_OG_KEYS = frozenset({"og_locale", "og_type"})

    def __init__(self, *, config: "RequestConfig | None" = None):
        """
        Args:
            config: request configuration handed to every `get` call. When
                omitted, a fresh `RequestConfig()` is created for this
                instance, so instances do not share one default object.
        """
        self.config = RequestConfig() if config is None else config
        self.url = "https://www.omdena.com/"

    @staticmethod
    def _join_url(base: str, path: str) -> str:
        """Join `base` and `path` with exactly one slash between them."""
        return f"{base.rstrip('/')}/{path.lstrip('/')}"

    @classmethod
    def _collect_og_fields(cls, entries) -> list:
        """
        Build one dictionary per entry from its `yoast_head_json` mapping.

        Only keys starting with `og_` are kept (minus `og_locale` and
        `og_type`), and the `og_` prefix is stripped from the key. Each entry
        is inspected on its own, so an entry that lacks a key present in
        another entry does not invalidate the whole result.
        """
        prefix = "og_"
        collected = []
        for entry in entries:
            metadata = entry.get("yoast_head_json") or {}
            collected.append(
                {
                    key[len(prefix):]: value
                    for key, value in metadata.items()
                    if key.startswith(prefix) and key not in cls._SKIPPED_OG_KEYS
                }
            )
        return collected

    def _fetch_og_entries(self, path: str, collection_key: str) -> list:
        """
        Download `<self.url>/<path>` and return the Open Graph fields of every
        item stored under `props.pageProps.<collection_key>` in the JSON held
        by the page's `__NEXT_DATA__` script tag.
        """
        response = get(self._join_url(self.url, path), self.config)
        page_soup = Soup(response.content, "html.parser")

        raw_json = page_soup.find("script", attrs={"id": "__NEXT_DATA__"}).text
        payload = json.loads(raw_json)

        return self._collect_og_fields(payload["props"]["pageProps"][collection_key])

    def get_datasets(self) -> "dict | None":
        """
        Class - `Omdena`

        Example:
        ```
        omdena = Omdena()
        omdena.get_datasets()
        ```

        Returns the list of available datasets in Omdena (`None` on failure):
        ```python
        {'datasets': [
            'https://datasets.omdena.com/dataset/flood-dataset-(malaysia)',
            'https://datasets.omdena.com/dataset/covid-radiology-images',
            'https://datasets.omdena.com/dataset/twitter-data-with-pq-scores'
        ]}
        ```
        """
        try:
            response = get(self._DATASETS_URL, self.config)
            page_soup = Soup(response.content, "html.parser")

            dataset_links = []
            for grid in page_soup.find_all("div", attrs={"class": "dataset-grid"}):
                for anchor in grid.find_all("a"):
                    href = anchor.get("href")
                    if not href:
                        # An anchor without a target would otherwise be
                        # reported as a URL ending in "None".
                        continue
                    if href.startswith(("http://", "https://")):
                        dataset_links.append(href)
                    else:
                        dataset_links.append(self._join_url(self._DATASETS_URL, href))

            return {"datasets": dataset_links}

        except Exception:
            # Best-effort contract of this class: report failure as None.
            return None

    def get_blogs(self) -> "dict | None":
        """
        Class - `Omdena`

        Example:
        ```
        omdena = Omdena()
        omdena.get_blogs()
        ```

        Returns the list of latest blogs in Omdena (`None` on failure):
        ```python
        {'blogs': [{
            'title': 'Revolutionizing Short-term Traffic Congestion Prediction with Machine Learning | AI Insights | Omdena Success Story',
            'description': 'Explore how our project leveraged machine learning and computer vision to predict short-term traffic congestion, revolutionizing urban traffic management.',
            'url': 'https://cmsnew.omdena.com/blog/predicting-short-term-traffic-congestion-on-urban-roads-using-machine-learning/',
            'site_name': 'Omdena | Building AI Solutions for Real-World Problems',
            'image': [{'width': 715,
                       'height': 520,
                       'url': 'https://cmsnew.omdena.com/wp-content/uploads/2023/05/Vehicle-and-Vehicle-Direction-Detection.jpeg',
                       'type': 'image/jpeg'}]
        }]}
        ```
        """
        try:
            return {"blogs": self._fetch_og_entries("blog", "articles")}
        except Exception:
            # Best-effort contract of this class: report failure as None.
            return None

    def get_projects(self) -> "dict | None":
        """
        Class - `Omdena`

        Example:
        ```
        omdena = Omdena()
        omdena.get_projects()
        ```

        Returns the list of latest ongoing projects in Omdena (`None` on failure):
        ```python
        {'projects': [{
            'title': 'Building Real-Time Anomaly Detection for Traveling Agency Transactions',
            'description': 'Join Omdena Top Talent Project to build a real-time anomaly detection system for analyzing transactions on travel agency websites by leveraging data analysis and AI technologies.',
            'url': 'https://cmsnew.omdena.com/projects/building-real-time-anomaly-detection-for-traveling-agency-transactions/',
            'site_name': 'Omdena | Building AI Solutions for Real-World Problems',
            'image': [{'width': 1600,
                       'height': 1066,
                       'url': 'https://cmsnew.omdena.com/wp-content/uploads/2024/06/traveling-agency-transactions.jpeg',
                       'type': 'image/jpeg'}]
        }]}
        ```
        """
        try:
            return {"projects": self._fetch_og_entries("projects", "projects")}
        except Exception:
            # Best-effort contract of this class: report failure as None.
            return None