Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 15, 2024
1 parent b486a02 commit d91fb36
Show file tree
Hide file tree
Showing 2 changed files with 179 additions and 47 deletions.
126 changes: 102 additions & 24 deletions core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

Expand All @@ -47,7 +50,6 @@
import org.eclipse.rdf4j.rio.csvw.parsers.CellParser;
import org.eclipse.rdf4j.rio.csvw.parsers.CellParserFactory;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;

import org.slf4j.LoggerFactory;

import com.opencsv.CSVParser;
Expand All @@ -61,13 +63,13 @@
*
* @author Bart Hanssens
*
* Basically it consists of an existing CSV file and a metadata file (in JSON-LD) describing the columns.
* Parsers need to convert the data client-side.
*
* @see <a href="https://w3c.github.io/csvw/primer/">CSV on the Web Primer</a>
* @see <a href="https://w3c.github.io/csvw/syntax/">Model for Tabular Data and Metadata on the Web</a>
* @see <a href="https://w3c.github.io/csvw/metadata">Metadata Vocabulary for Tabular Data</a>
*
*
* @since 5.1.0
*/
public class CSVWParser extends AbstractRDFParser {
Expand Down Expand Up @@ -384,20 +386,20 @@ private Resource generateRowNode(RDFHandler handler, Resource tableNode, Resourc

/**
* Check which cellparsers have placeholders that need to be replaced
*
*
* @param cellParsers
* @return
* @return
*/
private boolean[] havePlaceholders(CellParser[] cellParsers) {
private boolean[] needReplacement(CellParser[] cellParsers) {
boolean[] placeholders = new boolean[cellParsers.length];

for (int i = 0; i < cellParsers.length; i++) {
placeholders[i] &= (cellParsers[i].getAboutPlaceholders() != null);
placeholders[i] &= (cellParsers[i].getValuePlaceholders() != null);
}
return null;
return placeholders;
}

/**
* Parse a CSV file
*
Expand All @@ -408,49 +410,125 @@ private boolean[] havePlaceholders(CellParser[] cellParsers) {
*/
private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParser[] cellParsers, Resource table,
        Resource tableNode) {
    // Reads the CSV file row by row and reports one statement per (non-suppressed) cell to the handler.
    // Columns whose aboutURL/valueURL refer to other columns are handled in a second pass per row,
    // after all cell values of that row have been collected.
    String aboutURL = getAboutURL(metadata, table);

    // check for placeholder / column name that's being used to create subject IRI
    int aboutIndex = getAboutIndex(aboutURL, cellParsers);
    String placeholder = (aboutIndex > -1) ? "{" + cellParsers[aboutIndex].getNameEncoded() + "}" : null;

    // check which columns need replacement in aboutURL/valueURL
    boolean[] needReplacement = needReplacement(cellParsers);

    LOGGER.info("Parsing {}", csvFile);

    Charset encoding = getEncoding(metadata, table);
    boolean minimal = getParserConfig().get(CSVWParserSettings.MINIMAL_MODE);

    // true as soon as at least one column needs placeholder replacement
    boolean doReplace = false;
    for (boolean flag : needReplacement) {
        if (flag) {
            doReplace = true;
            // only stop scanning: returning here would skip parsing the file altogether
            break;
        }
    }

    long line = 1;
    try (InputStream is = csvFile.toURL().openStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, encoding));
            CSVReader csv = getCSVReader(metadata, table, reader)) {

        Map<String, String> values = null;
        String[] cells;

        while ((cells = csv.readNext()) != null) {
            Resource aboutSubject = getIRIorBnode(cellParsers, cells, aboutURL, aboutIndex, placeholder);
            Resource rowNode = minimal ? null : generateRowNode(rdfHandler, tableNode, aboutSubject, line);

            if (doReplace) {
                // fresh map per row, holding the row number and the parsed value of every cell
                // NOTE(review): "{_row}" is stored with braces while column names are stored without;
                // confirm which key format the placeholder lookups expect
                values = new HashMap<>(cells.length + 4, 1.0f);
                values.put("{_row}", Long.toString(line));
            }

            // csv cells
            for (int i = 0; i < cells.length; i++) {
                if (i == aboutIndex) { // already processed to get subject
                    if (doReplace) {
                        values.put(cellParsers[i].getNameEncoded(), cellParsers[i].parse(cells[i]).stringValue());
                    }
                    continue;
                }

                Value val = cellParsers[i].parse(cells[i]);
                if (doReplace) {
                    values.put(cellParsers[i].getNameEncoded(), val.stringValue());
                }
                // columns needing replacement are postponed until all values of the row are known
                if (!cellParsers[i].isSuppressed() && !needReplacement[i]) {
                    handleStatement(handler, cellParsers[i], cells[i], aboutSubject, val);
                }
            }
            // second pass, this time to replace placeholders in URLs with column values
            for (int i = 0; i < cells.length; i++) {
                if (i == aboutIndex || !needReplacement[i]) { // already processed
                    continue;
                }
                if (!cellParsers[i].isSuppressed()) {
                    handleStatement(handler, cellParsers[i], cells[i], aboutSubject, values);
                }
            }
            // virtual columns, if any: cell parsers beyond the number of cells in the row
            for (int i = cells.length; i < cellParsers.length; i++) {
                handleStatement(handler, cellParsers[i], null, aboutSubject, values);
            }
            line++;
        }
    } catch (IOException | CsvValidationException ex) {
        // report the row being processed when reading failed
        throw new RDFParseException("Error parsing " + csvFile, ex, line, -1);
    }
}

/**
 * Report one statement for a cell to the handler.
 *
 * The about URL and value URL produced by the cell parser take precedence; when the parser yields
 * none, the row subject and the already parsed value are used instead.
 *
 * @param handler      handler receiving the statement
 * @param cellParser   parser of the column the cell belongs to
 * @param cell         raw cell content
 * @param aboutSubject subject used when the cell parser has no about URL
 * @param val          object used when the cell parser has no value URL
 */
private void handleStatement(RDFHandler handler, CellParser cellParser, String cell, Resource aboutSubject,
        Value val) {
    Resource subject = cellParser.getAboutUrl(cell);
    if (subject == null) {
        subject = aboutSubject;
    }

    IRI property = cellParser.getPropertyIRI();

    Value object = cellParser.getValueUrl(cell);
    if (object == null) {
        object = val;
    }

    handler.handleStatement(Statements.statement(subject, property, object, null));
}

/**
 * Report one statement for a cell to the handler, parsing the cell value here.
 *
 * The about URL and value URL produced by the cell parser take precedence; when the parser yields
 * none, the row subject and the parsed cell value are used instead.
 *
 * NOTE(review): {@code values} is accepted but not consulted by this method yet, so no placeholder
 * replacement with other column values happens here.
 *
 * @param handler      handler receiving the statement
 * @param cellParser   parser of the column the cell belongs to
 * @param cell         raw cell content (the caller passes null for virtual columns)
 * @param aboutSubject subject used when the cell parser has no about URL
 * @param values       values collected for the current row, currently unused
 */
private void handleStatement(RDFHandler handler, CellParser cellParser, String cell, Resource aboutSubject,
        Map<String, String> values) {
    Resource subject = cellParser.getAboutUrl(cell);
    if (subject == null) {
        subject = aboutSubject;
    }

    IRI property = cellParser.getPropertyIRI();
    Resource valueUrl = cellParser.getValueUrl(cell);
    Value parsed = cellParser.parse(cell);

    Value object = parsed;
    if (valueUrl != null) {
        object = valueUrl;
    }

    handler.handleStatement(Statements.statement(subject, property, object, null));
}

/**
* Get configured CSV file reader
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -51,42 +52,42 @@ public abstract class CellParser {

// State for the aboutURL template: a non-null aboutPlaceholder enables replacement with this
// column's own cell value, aboutPlaceholders lists the placeholders filled from a map of values.
private String aboutPlaceholder;
private String[] aboutPlaceholders;

// Same pair for the valueURL template.
private String valuePlaceholder;
private String[] valuePlaceholders;

/**
 * Get the name of the column.
 *
 * @return column name, as stored by {@link #setName(String)}
 */
public String getName() {
    return this.name;
}

/**
 * Get the URL encoded name of the column.
 *
 * @return encoded name, as computed by {@link #setName(String)}
 */
public String getNameEncoded() {
    return this.encodedName;
}

/**
 * Set the name of the column and derive its URL encoded (UTF-8) form.
 *
 * @param name column name
 */
public void setName(String name) {
    this.name = name;
    String encoded = URLEncoder.encode(name, StandardCharsets.UTF_8);
    this.encodedName = encoded;
}

/**
* Get datatype
*
* @return
*
* @return
*/
public IRI getDataType() {
return dataType;
Expand Down Expand Up @@ -166,7 +167,6 @@ public void setVirtual(boolean virtual) {
this.virtual = virtual;
}


/**
* Extract placeholder name for the own column, if any
*
Expand All @@ -192,12 +192,12 @@ private String getOwnPlaceholder(String template) {
private String[] getPlaceholders(String template) {
Matcher matcher = PLACEHOLDERS.matcher(template);
String ownPlaceholder = getOwnPlaceholder(template);

if (matcher.find()) {
Set<String> placeholders = matcher.results()
.map(m -> m.group())
.filter(m -> !m.equals(ownPlaceholder))
.collect(Collectors.toSet());
.map(m -> m.group())
.filter(m -> !m.equals(ownPlaceholder))
.collect(Collectors.toSet());
return placeholders.toArray(new String[placeholders.size()]);
}
return null;
Expand All @@ -206,10 +206,35 @@ private String[] getPlaceholders(String template) {
/**
* Get aboutURL
*
* @param cell
* @return
*/
public String getAboutUrl() {
return aboutUrl;
public IRI getAboutUrl(String cell) {
if (aboutUrl == null) {
return null;
}
String s = aboutUrl;
if (aboutPlaceholder != null && cell != null) {
s = aboutUrl.replace(encodedName, getValueOrDefault(cell));
}
return Values.iri(s);
}

/**
 * Get aboutURL with placeholders replaced with values.
 *
 * Placeholders without a matching entry in {@code values} are left in place.
 *
 * @param values replacement values, keyed by placeholder
 * @return about IRI, or null when no aboutURL is set
 */
public IRI getAboutUrl(Map<String, String> values) {
    if (aboutUrl == null) {
        return null;
    }
    String s = aboutUrl;
    if (aboutPlaceholders != null && values != null) {
        for (String placeholder : aboutPlaceholders) {
            String replacement = values.get(placeholder);
            if (replacement != null) {
                // continue from the partially expanded string: restarting from the template
                // would keep only the last replacement
                s = s.replace(placeholder, replacement);
            }
        }
    }
    return Values.iri(s);
}

/**
Expand All @@ -226,8 +251,8 @@ public void setAboutUrl(String aboutUrl) {

/**
* Get about placeholders
*
* @return
*
* @return
*/
public String[] getAboutPlaceholders() {
return aboutPlaceholders;
Expand Down Expand Up @@ -264,10 +289,39 @@ public void setPropertyIRI(String propertyUrl) {
/**
* Get valueURL
*
* @param cell
* @return
*/
public String getValueUrl() {
return valueUrl;
public IRI getValueUrl(String cell) {
if (valueUrl == null) {
return null;
}
String s = valueUrl;
if (valuePlaceholder != null && cell != null) {
s = valueUrl.replace(encodedName, getValueOrDefault(cell));
}
return Values.iri(s);
}

/**
 * Get valueURL with placeholders replaced with values.
 *
 * The placeholder for this column (if any) is replaced by the cell value first, then the remaining
 * placeholders are replaced from {@code values}. Placeholders without a matching entry are left in
 * place.
 *
 * @param values replacement values, keyed by placeholder
 * @param cell   raw cell value of this column
 * @return value IRI, or null when no valueURL is set
 */
public IRI getValueUrl(Map<String, String> values, String cell) {
    if (valueUrl == null) {
        return null;
    }
    String s = valueUrl;
    if (valuePlaceholder != null) {
        // replace the complete "{name}" token, not just the bare name inside the braces
        s = s.replace("{" + encodedName + "}", getValueOrDefault(cell));
    }
    if (valuePlaceholders != null && values != null) {
        for (String placeholder : valuePlaceholders) {
            String replacement = values.get(placeholder);
            if (replacement != null) {
                // continue from the partially expanded string: restarting from the template
                // would discard the own-column replacement and all earlier placeholders
                s = s.replace(placeholder, replacement);
            }
        }
    }
    return Values.iri(s);
}

/**
Expand All @@ -283,8 +337,8 @@ public void setValueUrl(String valueUrl) {

/**
* Get value placeholders
*
* @return
*
* @return
*/
public String[] getValuePlaceholders() {
return valuePlaceholders;
Expand Down

0 comments on commit d91fb36

Please sign in to comment.