Skip to content

Commit

Permalink
Merge pull request #10865 from QualitativeDataRepository/IQSS/10379
Browse files Browse the repository at this point in the history
IQSS/10379 Fix Metrics API Bugs
  • Loading branch information
ofahimIQSS authored Oct 24, 2024
2 parents e2d1b6a + d383802 commit aebf31d
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 29 deletions.
10 changes: 10 additions & 0 deletions doc/release-notes/10379-MetricsBugsFixes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

### Metrics API Bug fixes

Two bugs in the Metrics API have been fixed:

- The /datasets and /datasets/byMonth endpoints could report incorrect values if/when they have been called using the dataLocation parameter (which allows getting metrics for local, remote (harvested), or all datasets) as the metrics cache was not storing different values for these cases.

- Metrics endpoints who's calculation relied on finding the latest published datasetversion were incorrect if/when the minor version number was > 9.

When deploying the new release, the [/api/admin/clearMetricsCache](https://guides.dataverse.org/en/latest/api/native-api.html#metrics) API should be called to remove old cached values that may be incorrect.
7 changes: 4 additions & 3 deletions src/main/java/edu/harvard/iq/dataverse/api/Metrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,13 @@ public Response getDatasetsTimeSeriest(@Context Request req, @Context UriInfo ur
return error(BAD_REQUEST, ia.getLocalizedMessage());
}
String metricName = "datasets";
JsonArray jsonArray = MetricsUtil.stringToJsonArray(metricsSvc.returnUnexpiredCacheAllTime(metricName, null, d));
String validDataLocation = MetricsUtil.validateDataLocationStringType(dataLocation);
JsonArray jsonArray = MetricsUtil.stringToJsonArray(metricsSvc.returnUnexpiredCacheAllTime(metricName, validDataLocation, d));

if (null == jsonArray) { // run query and save

jsonArray = metricsSvc.getDatasetsTimeSeries(uriInfo, dataLocation, d);
metricsSvc.save(new Metric(metricName, null, null, d, jsonArray.toString()));
jsonArray = metricsSvc.getDatasetsTimeSeries(uriInfo, validDataLocation, d);
metricsSvc.save(new Metric(metricName, null, validDataLocation, d, jsonArray.toString()));
}
MediaType requestedType = getVariant(req, MediaType.valueOf(FileUtil.MIME_TYPE_CSV), MediaType.APPLICATION_JSON_TYPE);
if ((requestedType != null) && (requestedType.equals(MediaType.APPLICATION_JSON_TYPE))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,33 +168,20 @@ public long datasetsToMonth(String yyyymm, String dataLocation, Dataverse d) {
}
}

// Note that this SQL line in the code below:
// datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))
// behaves somewhat counter-intuitively if the versionnumber and/or
// minorversionnumber is/are NULL - it results in an empty string
// (NOT the string "{dataset_id}:", in other words). Some harvested
// versions do not have version numbers (only the ones harvested from
// other Dataverses!) It works fine
// for our purposes below, because we are simply counting the selected
// lines - i.e. we don't care if some of these lines are empty.
// But do not use this notation if you need the values returned to
// meaningfully identify the datasets!


Query query = em.createNativeQuery(


"select count(*)\n"
+ "from (\n"
+ "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))\n"
+ "select DISTINCT ON (datasetversion.dataset_id) datasetversion.dataset_id \n"
+ "from datasetversion\n"
+ "join dataset on dataset.id = datasetversion.dataset_id\n"
+ ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n")
+ "where versionstate='RELEASED' \n"
+ ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n ")
+ "and \n"
+ dataLocationLine // be careful about adding more and statements after this line.
+ "group by dataset_id \n"
+ " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc\n"
+") sub_temp"
);
logger.log(Level.FINE, "Metric query: {0}", query);
Expand All @@ -207,15 +194,15 @@ public List<Object[]> datasetsBySubjectToMonth(String yyyymm, String dataLocatio
// A published local datasets may have more than one released version!
// So that's why we have to jump through some extra hoops below
// in order to select the latest one:
String originClause = "(datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in\n" +
String originClause = "(datasetversion.id in\n" +
"(\n" +
"select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))\n" +
"select DISTINCT ON (datasetversion.dataset_id) datasetversion.id\n" +
" from datasetversion\n" +
" join dataset on dataset.id = datasetversion.dataset_id\n" +
" where versionstate='RELEASED'\n" +
" and dataset.harvestingclient_id is null\n" +
" and date_trunc('month', releasetime) <= to_date('" + yyyymm + "','YYYY-MM')\n" +
" group by dataset_id\n" +
" order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc\n" +
"))\n";

if (!DATA_LOCATION_LOCAL.equals(dataLocation)) { // Default api state is DATA_LOCATION_LOCAL
Expand Down Expand Up @@ -273,15 +260,15 @@ public long datasetsPastDays(int days, String dataLocation, Dataverse d) {
Query query = em.createNativeQuery(
"select count(*)\n"
+ "from (\n"
+ "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max\n"
+ "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id\n"
+ "from datasetversion\n"
+ "join dataset on dataset.id = datasetversion.dataset_id\n"
+ ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n")
+ "where versionstate='RELEASED' \n"
+ ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n")
+ "and \n"
+ dataLocationLine // be careful about adding more and statements after this line.
+ "group by dataset_id \n"
+ " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc \n"
+") sub_temp"
);
logger.log(Level.FINE, "Metric query: {0}", query);
Expand Down Expand Up @@ -322,17 +309,17 @@ public long filesToMonth(String yyyymm, Dataverse d) {
+ "select count(*)\n"
+ "from filemetadata\n"
+ "join datasetversion on datasetversion.id = filemetadata.datasetversion_id\n"
+ "where datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in \n"
+ "where datasetversion.id in \n"
+ "(\n"
+ "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max \n"
+ "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id \n"
+ "from datasetversion\n"
+ "join dataset on dataset.id = datasetversion.dataset_id\n"
+ ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n")
+ "where versionstate='RELEASED'\n"
+ ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n")
+ "and date_trunc('month', releasetime) <= to_date('" + yyyymm + "','YYYY-MM')\n"
+ "and dataset.harvestingclient_id is null\n"
+ "group by dataset_id \n"
+ "order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber \n"
+ ");"
);
logger.log(Level.FINE, "Metric query: {0}", query);
Expand All @@ -345,17 +332,17 @@ public long filesPastDays(int days, Dataverse d) {
+ "select count(*)\n"
+ "from filemetadata\n"
+ "join datasetversion on datasetversion.id = filemetadata.datasetversion_id\n"
+ "where datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in \n"
+ "where datasetversion.id in \n"
+ "(\n"
+ "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max \n"
+ "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id \n"
+ "from datasetversion\n"
+ "join dataset on dataset.id = datasetversion.dataset_id\n"
+ ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n")
+ "where versionstate='RELEASED'\n"
+ "and releasetime > current_date - interval '" + days + "' day\n"
+ ((d == null) ? "" : "AND dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n")
+ "and dataset.harvestingclient_id is null\n"
+ "group by dataset_id \n"
+ "order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc \n"
+ ");"
);
logger.log(Level.FINE, "Metric query: {0}", query);
Expand Down

0 comments on commit aebf31d

Please sign in to comment.