diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 68bc17f..d89d1da --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Jupyter Books +_build/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/notebooks/openalex_preprints.ipynb b/notebooks/openalex_preprints.ipynb deleted file mode 100644 index 2b7ebcc..0000000 --- a/notebooks/openalex_preprints.ipynb +++ /dev/null @@ -1,87 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How to get all preprints which have not been published in a journal yet?\n", - "\n", - "We use [OpenAlex](https://openalex.org) to retrieve all articles which are a preprint, but have so far not been published by a peer-reviewed journal." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders\n", - "from itertools import chain\n", - "import pandas as pd\n", - "import pyalex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Variables reduce the size of the output and the time required for execution\n", - "year = 2020\n", - "n_max = None # when set to None all papers are queried" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query = Works().filter(type=\"article\", publication_year=year, primary_location={'version': 'submittedVersion'}).sort(cited_by_count=\"desc\")\n", - "\n", - "data = []\n", - "for item in chain(*query.paginate(per_page=200, n_max=n_max)):\n", - " title = item.get('title', None)\n", - " publication_date = item.get('publication_date', None)\n", - " doi = item.get('doi', None)\n", - " cited_by_count = item.get('cited_by_count', None)\n", - " locations_count = item.get('locations_count', None)\n", - " host_organization_source = item.get('primary_location', {}).get('source', {})\n", - " host_organization = None\n", - " if host_organization_source:\n", - " host_organization = host_organization_source.get('display_name')\n", - " \n", - " data.append({'Title': title, 'Publication Date': publication_date, 'DOI': doi, 'Host Organization': host_organization, 'Cited by Count': cited_by_count, 'Locations Count': locations_count})\n", - "\n", - "df = pd.DataFrame.from_dict(data)\n", - "df.to_csv(f'../results/openalex_preprints_{year}.csv')\n", - " " - ] - } - ], - "metadata": { - "interpreter": { - "hash": "dacd09b87f275de77aaa2661dde3428b7c30cfb80959b39c223c1792ae834a1a" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/openalex_preprints_step_by_step.ipynb b/notebooks/openalex_preprints_step_by_step.ipynb deleted file mode 100644 index 06aa885..0000000 --- a/notebooks/openalex_preprints_step_by_step.ipynb +++ /dev/null @@ -1,360 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepint Analysis - How to get all preprints of a topics?\n", - "\n", - "We use [OpenAlex](https://openalex.org) to retrieve all articles which are a preprint, but have so far not been published by a peer-reviewed journal." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Load libraries & define helper functions" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders\n", - "from itertools import chain\n", - "import pandas as pd\n", - "import pyalex" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Determine if any of the locations (journals) the paper is already published\n", - "def is_any_location_published(locations):\n", - " for location in locations:\n", - " if location['version'] == 'publishedVersion':\n", - " return True\n", - " return False" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Combine all authos\n", - "def join_authors(list_of_authors):\n", - " return ', '.join([author['author']['display_name'] for author in list_of_authors])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Extract key information from the locations\n", - "def join_locations(list_of_locations):\n", - " summary = []\n", - " for location in list_of_locations:\n", - " if location['source']:\n", - " summary.append(f\"{location['version']}: {location['source']['host_organization_name']} - {location['landing_page_url']}\")\n", - " else:\n", - " summary.append(f\"{location['version']} - {location['landing_page_url']}\")\n", - " return ', '.join(summary)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Set the Topic & Year\n", - "\n", - "Set the year and the number of papers you want to obtain" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "# Variables reduce the size of the output and the time required for execution\n", - "topic = 'COVID'\n", - "year = 2023\n", - "n_max = 500 # when set to None all papers are queried" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Get the preprints\n", - "\n", - "Run te following code to get the preprints for the specified parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "query = Works().search(topic).filter(type=\"article\", publication_year=year, primary_location={'version': 'submittedVersion'}, locations={'is_published': False}).sort(cited_by_count=\"desc\")\n", - "\n", - "preprints = []\n", - "\n", - "# Iterate over all query results\n", - "for item in chain(*query.paginate(per_page=200, n_max=n_max)):\n", - " \n", - " # Get key properties\n", - " oa_id = item.get('id', None)\n", - " title = item.get('title', None)\n", - " publication_date = item.get('publication_date', None)\n", - " doi = item.get('doi', None)\n", - " cited_by_count = item.get('cited_by_count', None)\n", - " locations_count = item.get('locations_count', None)\n", - " \n", - " # Join all authors\n", - " authors = join_authors(item['authorships']) \n", - " locations = item.get('locations', None)\n", - " locations_overview = join_locations(item['locations'])\n", - " \n", - " # Only append the paper to the preprints if is not published in any other journal\n", - " if locations_count == 1 or not is_any_location_published(locations):\n", - " preprints.append({'id': oa_id, 'title': title, 'publication_date': publication_date, 'doi': doi,\n", - " 'cited': cited_by_count, 'authors': authors,\n", - " 'locations': locations_overview, 'location_count': locations_count})\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Store the data" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | id | \n", - "title | \n", - "publication_date | \n", - "doi | \n", - "cited | \n", - "authors | \n", - "locations | \n", - "location_count | \n", - "
---|---|---|---|---|---|---|---|---|
0 | \n", - "https://openalex.org/W4318909870 | \n", - "Overcoming Vaccine Skepticism in Pakistan: A C... | \n", - "2023-02-02 | \n", - "https://doi.org/10.5281/zenodo.7597141 | \n", - "12 | \n", - "Bibi Aisha Sadiqa | \n", - "submittedVersion: European Organization for Nu... | \n", - "1 | \n", - "
1 | \n", - "https://openalex.org/W4386199227 | \n", - "Clinical Rationale for SARS-CoV-2 Base Spike P... | \n", - "2023-08-25 | \n", - "https://doi.org/10.5281/zenodo.8286460 | \n", - "5 | \n", - "Peter A. McCullough, Cade Wynn, Brian C Procter | \n", - "submittedVersion: European Organization for Nu... | \n", - "1 | \n", - "
2 | \n", - "https://openalex.org/W3197911323 | \n", - "The Political Economy of a Modern Pandemic: As... | \n", - "2023-07-24 | \n", - "https://doi.org/10.32920/23739360.v1 | \n", - "4 | \n", - "John Shields, Zainab Abu Alrob | \n", - "submittedVersion - https://doi.org/10.32920/23... | \n", - "2 | \n", - "
3 | \n", - "https://openalex.org/W4307231237 | \n", - "Higher risk of short term COVID-19 vaccine adv... | \n", - "2023-05-02 | \n", - "https://doi.org/10.1093/rheumatology/keac603 | \n", - "3 | \n", - "Mrinalini Dey, R Naveen, Elena Nikiphorou, Par... | \n", - "submittedVersion: National Institutes of Healt... | \n", - "1 | \n", - "
4 | \n", - "https://openalex.org/W4319655309 | \n", - "Navigating the Post-COVID Market: A Prospectiv... | \n", - "2023-02-09 | \n", - "https://doi.org/10.5281/zenodo.7625190 | \n", - "3 | \n", - "Peng Sun, Xiaode Zuo | \n", - "submittedVersion: European Organization for Nu... | \n", - "1 | \n", - "