Skip to content

Commit

Permalink
migrate github/zenodo registry to elton from globi-libs
Browse files Browse the repository at this point in the history
  • Loading branch information
Jorrit Poelen committed Dec 26, 2024
1 parent 64dd2ce commit 10d0301
Show file tree
Hide file tree
Showing 33 changed files with 80,263 additions and 1 deletion.
39 changes: 38 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,30 @@
</dependency>
<dependency>
<groupId>org.eol</groupId>
<artifactId>eol-globi-registry</artifactId>
<artifactId>eol-globi-data-sources</artifactId>
<version>${globi.version}</version>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.eol</groupId>
<artifactId>eol-globi-resource-remote</artifactId>
<version>${globi.version}</version>
<exclusions>
<exclusion>
Expand Down Expand Up @@ -134,7 +157,21 @@
<artifactId>junit</artifactId>
<version>4.13.1</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-core</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
<version>1.3</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.jbibtex</groupId>
<artifactId>jbibtex</artifactId>
Expand Down
27 changes: 27 additions & 0 deletions src/main/java/org/eol/globi/service/DatasetZenodo.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.eol.globi.service;

import org.eol.globi.util.InputStreamFactory;
import org.eol.globi.util.ResourceServiceLocalAndRemote;
import org.globalbioticinteractions.dataset.CitationUtil;
import org.globalbioticinteractions.dataset.DatasetWithResourceMapping;
import org.globalbioticinteractions.doi.DOI;

import java.io.File;
import java.net.URI;

/**
 * A {@link DatasetWithResourceMapping} whose DOI is looked up through
 * {@link CitationUtil#getDOI}.
 */
public class DatasetZenodo extends DatasetWithResourceMapping {

    /**
     * Creates a dataset that resolves its resources through a
     * {@link ResourceServiceLocalAndRemote} built from the provided stream factory
     * and cache directory.
     *
     * @param namespace            namespace of the dataset
     * @param zenodoGitHubArchives archive location handed to the superclass
     * @param inputStreamFactory   factory handed to the resource service
     * @param cacheDir             directory handed to the resource service
     */
    public DatasetZenodo(String namespace, URI zenodoGitHubArchives, InputStreamFactory inputStreamFactory, File cacheDir) {
        this(
                namespace,
                new ResourceServiceLocalAndRemote(inputStreamFactory, cacheDir),
                zenodoGitHubArchives
        );
    }

    /**
     * Creates a dataset that resolves its resources through the provided service.
     *
     * @param namespace       namespace of the dataset
     * @param resourceService service handed to the superclass
     * @param latestArchive   archive location handed to the superclass
     */
    public DatasetZenodo(String namespace, ResourceService resourceService, URI latestArchive) {
        super(namespace, latestArchive, resourceService);
    }

    /**
     * @return the DOI that {@link CitationUtil#getDOI} derives for this dataset
     */
    @Override
    public DOI getDOI() {
        return CitationUtil.getDOI(this);
    }

}
132 changes: 132 additions & 0 deletions src/main/java/org/eol/globi/service/GitHubUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package org.eol.globi.service;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.io.output.NullOutputStream;
import org.apache.commons.lang3.tuple.Pair;
import org.globalbioticinteractions.dataset.Dataset;
import org.globalbioticinteractions.dataset.DatasetImpl;
import org.globalbioticinteractions.util.GitClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Static helpers for finding GitHub repositories that contain a GloBI dataset
 * configuration and for building the URLs needed to access them.
 */
public class GitHubUtil {
    private static final Logger LOG = LoggerFactory.getLogger(GitHubUtil.class);

    // shared so that a new mapper is not built for every search result page
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * Retrieves the resource at the given URI and decodes it as UTF-8 text.
     *
     * @param githubURI       location of the resource
     * @param resourceService service used to retrieve the resource
     * @return content of the resource
     * @throws IOException on failure to retrieve or read the resource
     */
    private static String retrieveAsString(URI githubURI, ResourceService resourceService) throws URISyntaxException, IOException {
        try (InputStream is = resourceService.retrieve(githubURI)) {
            return org.apache.commons.io.IOUtils.toString(is, StandardCharsets.UTF_8);
        }
    }

    /**
     * Builds an <code>https://api.github.com</code> URI for the given path and query.
     *
     * @param path  absolute path of the API endpoint, e.g. <code>/search/repositories</code>
     * @param query unencoded query string, or <code>null</code> for none
     * @return the endpoint URI
     * @throws URISyntaxException if path and query do not form a valid URI
     */
    public static URI getGitHubAPIEndpoint(String path, String query) throws URISyntaxException {
        return new URI(
                "https",
                null,
                "api.github.com",
                -1,
                path,
                query,
                null
        );
    }

    /**
     * Probes whether the given resource can be retrieved and read to the end.
     *
     * @param gloBIConfigURI  location of the candidate configuration file
     * @param resourceService service used to retrieve the resource
     * @return true if the resource was fully read, false if it is unavailable
     */
    private static boolean hasInteractionData(URI gloBIConfigURI, ResourceService resourceService) throws IOException {
        try (InputStream is = resourceService.retrieve(gloBIConfigURI)) {
            if (is == null) {
                return false;
            }
            IOUtils.copy(is, NullOutputStream.NULL_OUTPUT_STREAM);
            return true;
        } catch (Exception ignored) {
            // best-effort probe: any failure to retrieve means "not available";
            // JVM errors (e.g., OutOfMemoryError) are deliberately not swallowed
            return false;
        }
    }

    private static URI getGloBIConfigURI(String repoName, String globiFilename, String commitHash) {
        return URI.create(getBaseUrl(repoName, commitHash) + "/" + globiFilename);
    }

    /**
     * Finds GitHub repositories that mention GloBI in their readme and
     * that contain a recognized dataset configuration file.
     *
     * @param resourceService service used to access GitHub
     * @return full names (<code>owner/repo</code>) of repositories with a dataset configuration
     * @throws URISyntaxException if a search URI cannot be built
     * @throws IOException        on failure to retrieve or parse search results
     */
    public static List<String> find(ResourceService resourceService) throws URISyntaxException, IOException {
        List<Pair<String, String>> globiRepos = searchGitHubForCandidateRepositories(resourceService);

        List<String> reposWithData = new ArrayList<>();
        for (Pair<String, String> globiRepo : globiRepos) {
            if (isGloBIRepository(globiRepo.getKey(), globiRepo.getValue(), resourceService)) {
                reposWithData.add(globiRepo.getKey());
            }
        }
        return reposWithData;
    }

    /**
     * Pages through the GitHub repository search for repositories that mention
     * <code>globalbioticinteractions</code> in their readme.
     *
     * @param resourceService service used to access the GitHub API
     * @return pairs of (repository full name, default branch)
     * @throws URISyntaxException if a search URI cannot be built
     * @throws IOException        on failure to retrieve or parse a result page
     */
    private static List<Pair<String, String>> searchGitHubForCandidateRepositories(ResourceService resourceService) throws URISyntaxException, IOException {
        int page = 1;
        int totalAvailable = 0;
        int itemsSeen = 0;
        int itemsOnPage;
        List<Pair<String, String>> globiRepos = new ArrayList<>();
        do {
            LOG.info("searching for repositories that mention [globalbioticinteractions], page [{}]...", page);
            String query = "q=globalbioticinteractions+in:readme+fork:true" +
                    "&per_page=100" +
                    "&page=" + page;
            String repositoriesThatMentionGloBI
                    = retrieveAsString(
                    getGitHubAPIEndpoint("/search/repositories", query),
                    resourceService
            );
            JsonNode jsonNode = OBJECT_MAPPER.readTree(repositoriesThatMentionGloBI);
            if (jsonNode.has("total_count")) {
                totalAvailable = jsonNode.get("total_count").asInt();
            }
            itemsOnPage = 0;
            if (jsonNode.has("items")) {
                for (JsonNode item : jsonNode.get("items")) {
                    itemsOnPage++;
                    // skip incomplete items instead of failing on a missing field
                    if (item.hasNonNull("full_name") && item.hasNonNull("default_branch")) {
                        String repoName = item.get("full_name").asText();
                        String branch = item.get("default_branch").asText();
                        globiRepos.add(Pair.of(repoName, branch));
                    }
                }
            }
            itemsSeen += itemsOnPage;
            page++;
            // an empty page means there is nothing more to retrieve, even if the
            // reported total has not been reached: stop to avoid paging forever
        }
        while (itemsOnPage > 0 && itemsSeen < totalAvailable);

        if (itemsSeen < totalAvailable) {
            LOG.warn("expected [{}] repositories that mention [globalbioticinteractions], but only [{}] were listed", totalAvailable, itemsSeen);
        }
        LOG.info("searching for repositories that mention [globalbioticinteractions] done.");
        return globiRepos;
    }

    /**
     * Checks whether a repository contains one of the recognized dataset
     * configuration files: <code>globi.json</code>, <code>globi-dataset.jsonld</code>
     * or <code>eml.xml</code>.
     *
     * @param globiRepo       full name (<code>owner/repo</code>) of the repository
     * @param commitSHA       git reference used to build the raw content URL;
     *                        note that {@link #find} passes the default branch name here
     * @param resourceService service used to probe for the files
     * @return true if at least one of the configuration files can be retrieved
     */
    static boolean isGloBIRepository(String globiRepo, String commitSHA, ResourceService resourceService) throws IOException {
        return hasInteractionData(getGloBIConfigURI(globiRepo, "globi.json", commitSHA), resourceService)
                || hasInteractionData(getGloBIConfigURI(globiRepo, "globi-dataset.jsonld", commitSHA), resourceService)
                || hasInteractionData(getGloBIConfigURI(globiRepo, "eml.xml", commitSHA), resourceService);
    }

    /**
     * @param repository      full name (<code>owner/repo</code>) of the repository
     * @param resourceService service used to access GitHub
     * @return result of {@link GitClient#getLastCommitSHA1}; callers in this class
     * treat <code>null</code> as "no commits found"
     * @throws IOException on failure to look up the commit
     */
    public static String lastCommitSHA(String repository, ResourceService resourceService) throws IOException {
        return GitClient.getLastCommitSHA1("https://github.com/" + repository, resourceService);
    }

    private static String getBaseUrl(String repo, String lastCommitSHA) {
        return "https://raw.githubusercontent.com/" + repo + "/" + lastCommitSHA;
    }

    /**
     * Builds the raw content base URL for the last commit of a repository.
     *
     * @param repo            full name (<code>owner/repo</code>) of the repository
     * @param resourceService service used to access GitHub
     * @return raw content base URL, without trailing slash
     * @throws IOException if the commit lookup fails or no commit was found
     */
    public static String getBaseUrlLastCommit(String repo, ResourceService resourceService) throws IOException, URISyntaxException {
        String lastCommitSHA = lastCommitSHA(repo, resourceService);
        if (lastCommitSHA == null) {
            throw new IOException("failed to import github repo [" + repo + "]: no commits found.");
        }
        return getBaseUrl(repo, lastCommitSHA);
    }

    /**
     * Creates a dataset that points at the GitHub zip archive of a specific commit.
     *
     * @param namespace       full name (<code>owner/repo</code>) of the repository
     * @param commitSha       commit used in the archive URL
     * @param resourceService service handed to the dataset
     * @return dataset for <code>https://github.com/[namespace]/archive/[commitSha].zip</code>
     */
    public static Dataset getArchiveDataset(String namespace, String commitSha, ResourceService resourceService) {
        return new DatasetImpl(
                namespace,
                resourceService,
                URI.create("https://github.com/" + namespace + "/archive/" + commitSha + ".zip")
        );
    }

}
67 changes: 67 additions & 0 deletions src/main/java/org/eol/globi/service/HttpEntityProxy.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package org.eol.globi.service;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.eol.globi.util.InputStreamFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

/**
 * {@link HttpEntity} that forwards every call to a wrapped entity, except that
 * the stream returned by {@link #getContent()} is first passed through an
 * {@link InputStreamFactory}.
 */
public class HttpEntityProxy implements HttpEntity {

    private final HttpEntity wrapped;
    private final InputStreamFactory inputStreamFactory;

    /**
     * @param entity             entity to forward calls to
     * @param inputStreamFactory factory applied to the content stream of the entity
     */
    public HttpEntityProxy(HttpEntity entity, InputStreamFactory inputStreamFactory) {
        this.wrapped = entity;
        this.inputStreamFactory = inputStreamFactory;
    }

    @Override
    public boolean isRepeatable() {
        return wrapped.isRepeatable();
    }

    @Override
    public boolean isChunked() {
        return wrapped.isChunked();
    }

    @Override
    public long getContentLength() {
        return wrapped.getContentLength();
    }

    @Override
    public Header getContentType() {
        return wrapped.getContentType();
    }

    @Override
    public Header getContentEncoding() {
        return wrapped.getContentEncoding();
    }

    /**
     * @return content stream of the wrapped entity, as produced by the stream factory
     */
    @Override
    public InputStream getContent() throws IOException, UnsupportedOperationException {
        final InputStream original = wrapped.getContent();
        return inputStreamFactory.create(original);
    }

    /**
     * Writes the wrapped entity as-is: the stream factory is not involved here.
     */
    @Override
    public void writeTo(OutputStream outputStream) throws IOException {
        wrapped.writeTo(outputStream);
    }

    @Override
    public boolean isStreaming() {
        return wrapped.isStreaming();
    }

    /**
     * Opens the content stream of the wrapped entity and closes it right away.
     */
    @Override
    public void consumeContent() throws IOException {
        try (InputStream ignored = wrapped.getContent()) {
            // nothing to read: the stream only needs to be closed
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package org.eol.globi.service;

import org.apache.http.HttpEntity;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.util.EntityUtils;
import org.eol.globi.util.InputStreamFactory;

import java.io.IOException;

/**
 * {@link BasicResponseHandler} that reads the response entity through an
 * {@link HttpEntityProxy}, so that the configured {@link InputStreamFactory}
 * is applied to the content stream.
 */
public class ResponseHandlerWithInputStreamFactory extends BasicResponseHandler {
    private final InputStreamFactory inputStreamFactory;

    /**
     * @param inputStreamFactory factory handed to the entity proxy of each handled response
     */
    public ResponseHandlerWithInputStreamFactory(InputStreamFactory inputStreamFactory) {
        this.inputStreamFactory = inputStreamFactory;
    }

    /**
     * @param entity response entity to read
     * @return the entity content as a string, read via the proxy
     * @throws IOException on failure to read the entity
     */
    @Override
    public String handleEntity(final HttpEntity entity) throws IOException {
        return EntityUtils.toString(new HttpEntityProxy(entity, inputStreamFactory));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.globalbioticinteractions.dataset;

import org.eol.globi.service.GitHubUtil;
import org.eol.globi.service.ResourceService;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.function.Consumer;

/**
 * Base class for dataset registries whose namespaces are discovered
 * through {@link GitHubUtil#find}.
 */
public abstract class DatasetRegistryGitHub implements DatasetRegistry {

    private final ResourceService resourceService;

    /**
     * @param resourceService service used for the namespace lookup and exposed to subclasses
     */
    DatasetRegistryGitHub(ResourceService resourceService) {
        this.resourceService = resourceService;
    }

    /**
     * @return namespaces found by {@link GitHubUtil#find}
     * @throws DatasetRegistryException wrapping any I/O or URI failure of the lookup
     */
    @Override
    public Iterable<String> findNamespaces() throws DatasetRegistryException {
        final Iterable<String> found;
        try {
            found = GitHubUtil.find(this.resourceService);
        } catch (URISyntaxException | IOException cause) {
            throw new DatasetRegistryException(cause);
        }
        return found;
    }

    /**
     * Hands each namespace returned by {@link #findNamespaces()} to the consumer, in order.
     *
     * @param namespaceConsumer receiver of the namespaces
     * @throws DatasetRegistryException if the namespace lookup fails
     */
    @Override
    public void findNamespaces(Consumer<String> namespaceConsumer) throws DatasetRegistryException {
        final Iterable<String> namespaces = findNamespaces();
        for (String candidate : namespaces) {
            namespaceConsumer.accept(candidate);
        }
    }

    /**
     * @return the resource service this registry was created with
     */
    public ResourceService getResourceService() {
        return resourceService;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package org.globalbioticinteractions.dataset;

import org.eol.globi.service.GitHubUtil;
import org.eol.globi.service.ResourceService;
import org.eol.globi.util.ResourceServiceHTTP;

import java.io.IOException;

/**
 * GitHub-backed registry that provides datasets as zip archives of the
 * last commit of a repository.
 */
public class DatasetRegistryGitHubArchive extends DatasetRegistryGitHub {

    /**
     * @param resourceService service used to access GitHub
     */
    public DatasetRegistryGitHubArchive(ResourceService resourceService) {
        super(resourceService);
    }

    /**
     * Looks up the last commit of the repository and creates an archive dataset for it.
     *
     * @param namespace full name (<code>owner/repo</code>) of the repository
     * @return dataset pointing at the archive of the last commit
     * @throws DatasetRegistryException if the commit lookup fails or no commit was found
     */
    @Override
    public Dataset datasetFor(String namespace) throws DatasetRegistryException {
        try {
            String commitSha = GitHubUtil.lastCommitSHA(namespace, getResourceService());
            if (commitSha == null) {
                // without this check, the archive URL would end in "/archive/null.zip"
                throw new IOException("failed to import github repo [" + namespace + "]: no commits found.");
            }
            return GitHubUtil.getArchiveDataset(namespace, commitSha, getResourceService());
        } catch (IOException e) {
            throw new DatasetRegistryException(e);
        }
    }

}
Loading

0 comments on commit 10d0301

Please sign in to comment.