Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 15, 2024
1 parent f930705 commit 15e3321
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 67 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ public class CSVW {
/** csvw:required */
public static final IRI REQUIRED;

/**
 * csvw:row
 * <p>
 * NOTE(review): named {@code HAS_ROW} rather than {@code ROW}, presumably to keep this property distinct from the
 * constant for the row class - confirm against the rest of this vocabulary.
 */
public static final IRI HAS_ROW;

/** csvw:rownum */
public static final IRI ROWNUM;

Expand Down Expand Up @@ -145,7 +148,7 @@ public class CSVW {
SCHEMA = Vocabularies.createIRI(NAMESPACE, "Schema");
TABLE = Vocabularies.createIRI(NAMESPACE, "Table");
TABLE_GROUP = Vocabularies.createIRI(NAMESPACE, "TableGroup");

ABOUT_URL = Vocabularies.createIRI(NAMESPACE, "aboutUrl");
BASE = Vocabularies.createIRI(NAMESPACE, "base");
COLUMN = Vocabularies.createIRI(NAMESPACE, "column");
Expand All @@ -166,6 +169,7 @@ public class CSVW {
PROPERTY_URL = Vocabularies.createIRI(NAMESPACE, "propertyUrl");
QUOTE_CHAR = Vocabularies.createIRI(NAMESPACE, "quoteChar");
REQUIRED = Vocabularies.createIRI(NAMESPACE, "required");
HAS_ROW = Vocabularies.createIRI(NAMESPACE, "row");
ROWNUM = Vocabularies.createIRI(NAMESPACE, "rownum");
SKIP_COLUMNS = Vocabularies.createIRI(NAMESPACE, "skipColumns");
SKIP_ROWS = Vocabularies.createIRI(NAMESPACE, "skipRows");
Expand Down
101 changes: 76 additions & 25 deletions core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.util.stream.Collectors;

import org.apache.commons.lang3.CharSet;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
Expand All @@ -37,6 +38,7 @@
import org.eclipse.rdf4j.model.util.Statements;
import org.eclipse.rdf4j.model.util.Values;
import org.eclipse.rdf4j.model.vocabulary.CSVW;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.rio.ParserConfig;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandler;
Expand Down Expand Up @@ -79,19 +81,23 @@ public synchronized void parse(InputStream in, String baseURI)

clear();

RDFHandler rdfHandler = getRDFHandler();

Model metadata = parseMetadata(in, null, baseURI);
if (metadata == null || metadata.isEmpty()) {
throw new RDFParseException("No metadata found");
}

RDFHandler rdfHandler = getRDFHandler();

boolean minimal = getParserConfig().get(CSVWParserSettings.MINIMAL_MODE);
Resource rootNode = minimal ? null : generateTablegroupNode(rdfHandler);

List<Value> tables = getTables(metadata);
for (Value table : tables) {
URI csvFile = getURL(metadata, (Resource) table, baseURI);
if (csvFile == null) {
throw new RDFParseException("Could not find URL");
}
Resource tableNode = minimal ? null : generateTableNode(rdfHandler, rootNode);
// add dummy namespace for resolving unspecified column names / predicates relative to CSV file
metadata.getNamespaces().add(new SimpleNamespace("_local", csvFile.toString() + "#"));

Expand All @@ -102,7 +108,7 @@ public synchronized void parse(InputStream in, String baseURI)
.collect(Collectors.toList())
.toArray(new CellParser[columns.size()]);

parseCSV(metadata, rdfHandler, csvFile, cellParsers, (Resource) table);
parseCSV(metadata, rdfHandler, csvFile, cellParsers, (Resource) table, tableNode);
}
clear();
}
Expand Down Expand Up @@ -137,25 +143,6 @@ private Model parseMetadata(InputStream in, Reader reader, String baseURI) throw
return metadata;
}

/**
 * Get (the blank nodes of) the table(s) described in the metadata.
 * <p>
 * When the metadata contains a {@link CSVW#TABLES} statement, its object is read as an RDF collection and the
 * values of that collection are returned. Otherwise the metadata is assumed to use the simplified single-table
 * structure, and the subject of the first {@link CSVW#TABLE_SCHEMA} statement is returned as the only table.
 *
 * @param metadata parsed metadata model
 * @return list of table nodes
 * @throws RDFParseException if the metadata contains neither tables nor a table schema, or if the value of the
 *                           tables property is not a resource
 */
private List<Value> getTables(Model metadata) throws RDFParseException {
	Iterator<Statement> it = metadata.getStatements(null, CSVW.TABLES, null).iterator();
	if (it.hasNext()) {
		Value head = it.next().getObject();
		// a literal here means malformed metadata: report it as a parse error instead of a ClassCastException
		if (!(head instanceof Resource)) {
			throw new RDFParseException("Metadata file has an invalid list of tables: " + head);
		}
		return RDFCollections.asValues(metadata, (Resource) head, new ArrayList<>());
	}

	// only one table, simplified structure
	it = metadata.getStatements(null, CSVW.TABLE_SCHEMA, null).iterator();
	if (!it.hasNext()) {
		throw new RDFParseException("Metadata file has no tables and no tableSchema");
	}
	return List.of(it.next().getSubject());
}

/**
* Get the location of the CSV file
*
Expand Down Expand Up @@ -191,6 +178,25 @@ private Resource getTableSchema(Model metadata, Resource table) throws RDFParseE
.orElseThrow(() -> new RDFParseException("Metadata file does not contain tableSchema for " + table));
}

/**
 * Get (the blank nodes of) the table(s) described in the metadata.
 * <p>
 * When the metadata contains a {@link CSVW#TABLES} statement, its object is read as an RDF collection and the
 * values of that collection are returned. Otherwise the metadata is assumed to use the simplified single-table
 * structure, and the subject of the first {@link CSVW#TABLE_SCHEMA} statement is returned as the only table.
 *
 * @param metadata parsed metadata model
 * @return list of table nodes
 * @throws RDFParseException if the metadata contains neither tables nor a table schema, or if the value of the
 *                           tables property is not a resource
 */
private List<Value> getTables(Model metadata) throws RDFParseException {
	Iterator<Statement> it = metadata.getStatements(null, CSVW.TABLES, null).iterator();
	if (it.hasNext()) {
		Value head = it.next().getObject();
		// a literal here means malformed metadata: report it as a parse error instead of a ClassCastException
		if (!(head instanceof Resource)) {
			throw new RDFParseException("Metadata file has an invalid list of tables: " + head);
		}
		return RDFCollections.asValues(metadata, (Resource) head, new ArrayList<>());
	}

	// only one table, simplified structure
	it = metadata.getStatements(null, CSVW.TABLE_SCHEMA, null).iterator();
	if (!it.hasNext()) {
		throw new RDFParseException("Metadata file has no tables and no tableSchema");
	}
	return List.of(it.next().getSubject());
}

/**
* Get the (blank nodes of the) columns for a given tableschema
*
Expand Down Expand Up @@ -279,7 +285,7 @@ private IRI getDatatypeIRI(Model metadata, Resource column) {
}

/**
* Get format string
* Get format string, e.g. date format
*
* @param metadata
* @param column
Expand Down Expand Up @@ -331,6 +337,46 @@ private int getAboutIndex(String aboutURL, CellParser[] cellParsers) {
return -1;
}

/**
 * Create the table group node, used as root node when the parser is not in minimal mode.
 * <p>
 * A fresh blank node is created and a single statement typing it as {@link CSVW#TABLE_GROUP} is passed to the
 * handler.
 *
 * @param handler RDF handler receiving the generated statement
 * @return the newly created blank node
 */
private Resource generateTablegroupNode(RDFHandler handler) {
	BNode tableGroup = Values.bnode();
	Statement typeStatement = Statements.statement(tableGroup, RDF.TYPE, CSVW.TABLE_GROUP, null);
	handler.handleStatement(typeStatement);
	return tableGroup;
}

/**
 * Create a table node and attach it to the given root node.
 * <p>
 * Two statements are passed to the handler, in this order: the link from the root node to the new blank node,
 * followed by the statement typing the new blank node as {@link CSVW#TABLE}.
 *
 * @param handler  RDF handler receiving the generated statements
 * @param rootNode node (table group) the new table node is linked from
 * @return the newly created blank node for the table
 */
private Resource generateTableNode(RDFHandler handler, Resource rootNode) {
	BNode table = Values.bnode();

	// NOTE(review): the CSV2RDF recommendation links a table group to its tables with csvw:table - confirm that
	// CSVW.TABLES is the intended predicate here
	Statement link = Statements.statement(rootNode, CSVW.TABLES, table, null);
	Statement type = Statements.statement(table, RDF.TYPE, CSVW.TABLE, null);

	handler.handleStatement(link);
	handler.handleStatement(type);
	return table;
}

/**
 * Create a row node, attach it to the given table node and describe it.
 * <p>
 * Four statements are passed to the handler: the link from the table node to the new blank node, the statement
 * typing the blank node as {@link CSVW#ROW}, its row number and the link to the subject described by the row.
 *
 * @param handler   RDF handler receiving the generated statements
 * @param tableNode node of the table this row belongs to
 * @param subject   subject of the statements generated from the cells of this row
 * @param rownum    number of the row
 * @return the newly created blank node for the row
 */
private Resource generateRowNode(RDFHandler handler, Resource tableNode, Resource subject, long rownum) {
	BNode node = Values.bnode();
	handler.handleStatement(Statements.statement(tableNode, CSVW.HAS_ROW, node, null));
	handler.handleStatement(Statements.statement(node, RDF.TYPE, CSVW.ROW, null));
	// CSV2RDF requires the row number to be an xsd:integer literal; a plain long would be typed as xsd:long
	handler.handleStatement(Statements.statement(node, CSVW.ROWNUM,
			Values.literal(java.math.BigInteger.valueOf(rownum)), null));
	handler.handleStatement(Statements.statement(node, CSVW.DESCRIBES, subject, null));
	return node;
}

/**
* Parse a CSV file
*
Expand All @@ -339,7 +385,8 @@ private int getAboutIndex(String aboutURL, CellParser[] cellParsers) {
* @param aboutURL
* @param aboutIndex
*/
private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParser[] cellParsers, Resource table) {
private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParser[] cellParsers, Resource table,
Resource tableNode) {
String aboutURL = getAboutURL(metadata, table);

// check for placeholder / column name that's being used to create subject IRI
Expand All @@ -349,21 +396,25 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
LOGGER.info("Parsing {}", csvFile);

Charset encoding = getEncoding(metadata, table);
boolean minimal = getParserConfig().get(CSVWParserSettings.MINIMAL_MODE);

long line = 0;
long line = 1;
try (InputStream is = csvFile.toURL().openStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(is, encoding));
CSVReader csv = getCSVReader(metadata, table, reader)) {

String[] cells;
while ((cells = csv.readNext()) != null) {
Resource subject = getIRIorBnode(cellParsers, cells, aboutURL, aboutIndex, placeholder);
Resource rowNode = minimal ? null : generateRowNode(rdfHandler, tableNode, subject, line);

Value val;
Statement stmt;
for (int i = 0; i < cells.length; i++) {
if (i == aboutIndex) { // already processed to get subject
continue;
}

IRI predicate = cellParsers[i].getPropertyIRI();
val = cellParsers[i].parse(cells[i]);
handler.handleStatement(Statements.statement(subject, predicate, val, null));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,17 @@ public class CSVWParserSettings {
public static final BooleanRioSetting FAIL_ON_INVALID_LINES = new BooleanRioSetting(
"org.eclipse.rdf4j.rio.csvw.fail_on_invalid_lines", "Fail on CSVW invalid lines", Boolean.TRUE);

/**
 * Boolean setting for parser to determine whether 'minimal mode' is to be used, i.e. only produce triples from the
 * data cells, without adding table metadata.
 * <p>
 * Defaults to false.
 * <p>
 * Can be overridden by setting system property {@code org.eclipse.rdf4j.rio.csvw.minimal_mode}
 */
public static final BooleanRioSetting MINIMAL_MODE = new BooleanRioSetting(
		"org.eclipse.rdf4j.rio.csvw.minimal_mode", "Use CSVW minimal mode", Boolean.FALSE);

/**
* Private constructor
*/
Expand Down
Loading

0 comments on commit 15e3321

Please sign in to comment.