-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
migrate github/zenodo registry to elton from globi-libs
- Loading branch information
Jorrit Poelen
committed
Dec 26, 2024
1 parent
64dd2ce
commit 10d0301
Showing
33 changed files
with
80,263 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package org.eol.globi.service; | ||
|
||
import org.eol.globi.util.InputStreamFactory; | ||
import org.eol.globi.util.ResourceServiceLocalAndRemote; | ||
import org.globalbioticinteractions.dataset.CitationUtil; | ||
import org.globalbioticinteractions.dataset.DatasetWithResourceMapping; | ||
import org.globalbioticinteractions.doi.DOI; | ||
|
||
import java.io.File; | ||
import java.net.URI; | ||
|
||
public class DatasetZenodo extends DatasetWithResourceMapping { | ||
public DatasetZenodo(String namespace, URI zenodoGitHubArchives, InputStreamFactory inputStreamFactory, File cacheDir) { | ||
super(namespace, zenodoGitHubArchives, new ResourceServiceLocalAndRemote(inputStreamFactory, cacheDir)); | ||
} | ||
|
||
public DatasetZenodo(String namespace, ResourceService resourceService, URI latestArchive) { | ||
super(namespace, latestArchive, resourceService); | ||
} | ||
|
||
@Override | ||
public DOI getDOI() { | ||
return CitationUtil.getDOI(this); | ||
} | ||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
package org.eol.globi.service; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import org.apache.commons.compress.utils.IOUtils; | ||
import org.apache.commons.io.output.NullOutputStream; | ||
import org.apache.commons.lang3.tuple.Pair; | ||
import org.globalbioticinteractions.dataset.Dataset; | ||
import org.globalbioticinteractions.dataset.DatasetImpl; | ||
import org.globalbioticinteractions.util.GitClient; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.net.URI; | ||
import java.net.URISyntaxException; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public class GitHubUtil { | ||
private static final Logger LOG = LoggerFactory.getLogger(GitHubUtil.class); | ||
|
||
private static String retrieveAsString(URI githubURI, ResourceService resourceService) throws URISyntaxException, IOException { | ||
try (InputStream is = resourceService.retrieve(githubURI)) { | ||
return org.apache.commons.io.IOUtils.toString(is, StandardCharsets.UTF_8); | ||
} | ||
} | ||
|
||
public static URI getGitHubAPIEndpoint(String path, String query) throws URISyntaxException { | ||
return new URI( | ||
"https", | ||
null, | ||
"api.github.com", | ||
-1, | ||
path, | ||
query, | ||
null | ||
); | ||
} | ||
|
||
private static boolean hasInteractionData(URI gloBIConfigURI, ResourceService resourceService) throws IOException { | ||
try (InputStream is = resourceService.retrieve(gloBIConfigURI)) { | ||
IOUtils.copy(is, NullOutputStream.NULL_OUTPUT_STREAM); | ||
return true; | ||
} catch (Throwable th) { | ||
return false; | ||
} | ||
} | ||
|
||
private static URI getGloBIConfigURI(String repoName, String globiFilename, String commitHash) { | ||
return URI.create(getBaseUrl(repoName, commitHash) + "/" + globiFilename); | ||
} | ||
|
||
public static List<String> find(ResourceService resourceService) throws URISyntaxException, IOException { | ||
List<Pair<String, String>> globiRepos = searchGitHubForCandidateRepositories(resourceService); | ||
|
||
List<String> reposWithData = new ArrayList<>(); | ||
for (Pair<String, String> globiRepo : globiRepos) { | ||
if (isGloBIRepository(globiRepo.getKey(), globiRepo.getValue(), resourceService)) { | ||
reposWithData.add(globiRepo.getKey()); | ||
} | ||
} | ||
return reposWithData; | ||
} | ||
|
||
private static List<Pair<String, String>> searchGitHubForCandidateRepositories(ResourceService resourceService) throws URISyntaxException, IOException { | ||
int page = 1; | ||
int totalAvailable = 0; | ||
List<Pair<String, String>> globiRepos = new ArrayList<>(); | ||
do { | ||
LOG.info("searching for repositories that mention [globalbioticinteractions], page [" + page + "]..."); | ||
String query = "q=globalbioticinteractions+in:readme+fork:true" + | ||
"&per_page=100" + | ||
"&page=" + page; | ||
String repositoriesThatMentionGloBI | ||
= retrieveAsString( | ||
getGitHubAPIEndpoint("/search/repositories", query), | ||
resourceService | ||
); | ||
JsonNode jsonNode = new ObjectMapper().readTree(repositoriesThatMentionGloBI); | ||
if (jsonNode.has("total_count")) { | ||
totalAvailable = jsonNode.get("total_count").asInt(); | ||
} | ||
if (jsonNode.has("items")) { | ||
for (JsonNode item : jsonNode.get("items")) { | ||
if (item.has("full_name")) { | ||
String repoName = item.get("full_name").asText(); | ||
String branch = item.get("default_branch").asText(); | ||
globiRepos.add(Pair.of(repoName, branch)); | ||
} | ||
} | ||
} | ||
page++; | ||
} | ||
while (globiRepos.size() < totalAvailable); | ||
LOG.info("searching for repositories that mention [globalbioticinteractions] done."); | ||
return globiRepos; | ||
} | ||
|
||
static boolean isGloBIRepository(String globiRepo, String commitSHA, ResourceService resourceService) throws IOException { | ||
return hasInteractionData(getGloBIConfigURI(globiRepo, "globi.json", commitSHA), resourceService) | ||
|| hasInteractionData(getGloBIConfigURI(globiRepo, "globi-dataset.jsonld", commitSHA), resourceService) | ||
|| hasInteractionData(getGloBIConfigURI(globiRepo, "eml.xml", commitSHA), resourceService); | ||
} | ||
|
||
public static String lastCommitSHA(String repository, ResourceService resourceService) throws IOException { | ||
return GitClient.getLastCommitSHA1("https://github.com/" + repository, resourceService); | ||
} | ||
|
||
private static String getBaseUrl(String repo, String lastCommitSHA) { | ||
return "https://raw.githubusercontent.com/" + repo + "/" + lastCommitSHA; | ||
} | ||
|
||
public static String getBaseUrlLastCommit(String repo, ResourceService resourceService) throws IOException, URISyntaxException { | ||
String lastCommitSHA = lastCommitSHA(repo, resourceService); | ||
if (lastCommitSHA == null) { | ||
throw new IOException("failed to import github repo [" + repo + "]: no commits found."); | ||
} | ||
return getBaseUrl(repo, lastCommitSHA); | ||
} | ||
|
||
public static Dataset getArchiveDataset(String namespace, String commitSha, ResourceService resourceService) { | ||
return new DatasetImpl( | ||
namespace, | ||
resourceService, | ||
URI.create("https://github.com/" + namespace + "/archive/" + commitSha + ".zip") | ||
); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package org.eol.globi.service; | ||
|
||
import org.apache.http.Header; | ||
import org.apache.http.HttpEntity; | ||
import org.eol.globi.util.InputStreamFactory; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.OutputStream; | ||
|
||
public class HttpEntityProxy implements HttpEntity { | ||
|
||
private final HttpEntity entity; | ||
private final InputStreamFactory inputStreamFactory; | ||
|
||
public HttpEntityProxy(HttpEntity entity, InputStreamFactory inputStreamFactory) { | ||
this.entity = entity; | ||
this.inputStreamFactory = inputStreamFactory; | ||
} | ||
|
||
@Override | ||
public boolean isRepeatable() { | ||
return entity.isRepeatable(); | ||
} | ||
|
||
@Override | ||
public boolean isChunked() { | ||
return entity.isChunked(); | ||
} | ||
|
||
@Override | ||
public long getContentLength() { | ||
return entity.getContentLength(); | ||
} | ||
|
||
@Override | ||
public Header getContentType() { | ||
return entity.getContentType(); | ||
} | ||
|
||
@Override | ||
public Header getContentEncoding() { | ||
return entity.getContentEncoding(); | ||
} | ||
|
||
@Override | ||
public InputStream getContent() throws IOException, UnsupportedOperationException { | ||
return inputStreamFactory.create(entity.getContent()); | ||
} | ||
|
||
@Override | ||
public void writeTo(OutputStream outputStream) throws IOException { | ||
entity.writeTo(outputStream); | ||
} | ||
|
||
@Override | ||
public boolean isStreaming() { | ||
return entity.isStreaming(); | ||
} | ||
|
||
@Override | ||
public void consumeContent() throws IOException { | ||
try (InputStream content = entity.getContent()) { | ||
// | ||
} | ||
} | ||
} |
22 changes: 22 additions & 0 deletions
22
src/main/java/org/eol/globi/service/ResponseHandlerWithInputStreamFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package org.eol.globi.service; | ||
|
||
import org.apache.http.HttpEntity; | ||
import org.apache.http.impl.client.BasicResponseHandler; | ||
import org.apache.http.util.EntityUtils; | ||
import org.eol.globi.util.InputStreamFactory; | ||
|
||
import java.io.IOException; | ||
|
||
public class ResponseHandlerWithInputStreamFactory extends BasicResponseHandler { | ||
private final InputStreamFactory inputStreamFactory; | ||
|
||
public ResponseHandlerWithInputStreamFactory(InputStreamFactory inputStreamFactory) { | ||
this.inputStreamFactory = inputStreamFactory; | ||
} | ||
|
||
@Override | ||
public String handleEntity(final HttpEntity entity) throws IOException { | ||
HttpEntityProxy httpEntityProxy = new HttpEntityProxy(entity, inputStreamFactory); | ||
return EntityUtils.toString(httpEntityProxy); | ||
} | ||
} |
39 changes: 39 additions & 0 deletions
39
src/main/java/org/globalbioticinteractions/dataset/DatasetRegistryGitHub.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package org.globalbioticinteractions.dataset; | ||
|
||
import org.eol.globi.service.GitHubUtil; | ||
import org.eol.globi.service.ResourceService; | ||
|
||
import java.io.IOException; | ||
import java.net.URISyntaxException; | ||
import java.util.function.Consumer; | ||
|
||
public abstract class DatasetRegistryGitHub implements DatasetRegistry { | ||
|
||
private final ResourceService resourceService; | ||
|
||
DatasetRegistryGitHub(ResourceService resourceService) { | ||
this.resourceService = resourceService; | ||
} | ||
|
||
@Override | ||
public Iterable<String> findNamespaces() throws DatasetRegistryException { | ||
try { | ||
return GitHubUtil.find(this.resourceService); | ||
} catch (URISyntaxException | IOException e) { | ||
throw new DatasetRegistryException(e); | ||
} | ||
} | ||
|
||
@Override | ||
public void findNamespaces(Consumer<String> namespaceConsumer) throws DatasetRegistryException { | ||
for (String namespace : findNamespaces()) { | ||
namespaceConsumer.accept(namespace); | ||
} | ||
} | ||
|
||
|
||
public ResourceService getResourceService() { | ||
return resourceService; | ||
} | ||
|
||
} |
25 changes: 25 additions & 0 deletions
25
src/main/java/org/globalbioticinteractions/dataset/DatasetRegistryGitHubArchive.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
package org.globalbioticinteractions.dataset; | ||
|
||
import org.eol.globi.service.GitHubUtil; | ||
import org.eol.globi.service.ResourceService; | ||
import org.eol.globi.util.ResourceServiceHTTP; | ||
|
||
import java.io.IOException; | ||
|
||
public class DatasetRegistryGitHubArchive extends DatasetRegistryGitHub { | ||
|
||
public DatasetRegistryGitHubArchive(ResourceService resourceService) { | ||
super(resourceService); | ||
} | ||
|
||
@Override | ||
public Dataset datasetFor(String namespace) throws DatasetRegistryException { | ||
try { | ||
String commitSha = GitHubUtil.lastCommitSHA(namespace, getResourceService()); | ||
return GitHubUtil.getArchiveDataset(namespace, commitSha, getResourceService()); | ||
} catch (IOException e) { | ||
throw new DatasetRegistryException(e); | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.