Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 15, 2024
1 parent d91fb36 commit ed1e748
Show file tree
Hide file tree
Showing 2 changed files with 189 additions and 143 deletions.
152 changes: 9 additions & 143 deletions core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ public synchronized void parse(InputStream in, String baseURI)
Resource tableSchema = getTableSchema(metadata, (Resource) table);
List<Value> columns = getColumns(metadata, tableSchema);
CellParser[] cellParsers = columns.stream()
.map(c -> getCellParser(metadata, (Resource) c))
.map(c -> CSVWUtil.getCellParser(metadata, (Resource) c))
.collect(Collectors.toList())
.toArray(new CellParser[columns.size()]);

Expand Down Expand Up @@ -218,96 +218,6 @@ private List<Value> getColumns(Model metadata, Resource tableSchema) throws RDFP
return RDFCollections.asValues(metadata, head.get(), new ArrayList<>());
}

/**
 * Create and configure the cell parser for a specific column, based on the column description in the metadata.
 *
 * @param metadata metadata model
 * @param column   resource describing the column
 * @return configured cell parser
 * @throws RDFParseException when the column description does not contain a name
 */
private CellParser getCellParser(Model metadata, Resource column) {
	IRI datatype = getDatatypeIRI(metadata, column);

	CellParser parser = CellParserFactory.create(datatype);

	// the name is mandatory here: it is used below to build the default property IRI.
	// The exception must actually be thrown, merely constructing it silently accepts the missing name
	String name = Models.getPropertyString(metadata, column, CSVW.NAME)
			.orElseThrow(
					() -> new RDFParseException("Metadata file does not contain name for column " + column));
	parser.setName(name);

	Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v));
	Models.getPropertyString(metadata, column, CSVW.REQUIRED)
			.ifPresent(v -> parser.setRequired(Boolean.parseBoolean(v)));
	Models.getPropertyString(metadata, column, CSVW.VIRTUAL)
			.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v)));
	// NOTE(review): this overwrites the "virtual" flag with the value of "suppressOutput", which looks like a
	// copy-paste slip. Confirm which CellParser setter is intended before changing it.
	Models.getPropertyString(metadata, column, CSVW.SUPPRESS_OUTPUT)
			.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v)));

	// only useful for strings
	Models.getPropertyString(metadata, column, CSVW.LANG).ifPresent(v -> parser.setLang(v));

	// only useful for numeric
	Models.getPropertyString(metadata, column, CSVW.DECIMAL_CHAR).ifPresent(v -> parser.setDecimalChar(v));
	Models.getPropertyString(metadata, column, CSVW.GROUP_CHAR).ifPresent(v -> parser.setGroupChar(v));

	// mostly for date formats
	getFormat(metadata, column).ifPresent(v -> parser.setFormat(v));

	// NOTE(review): same as above, "trim" currently ends up in the "virtual" flag (and wins over the two
	// earlier assignments when present). Confirm the intended setter.
	Models.getPropertyString(metadata, column, CSVW.TRIM)
			.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v)));

	Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueUrl(v));

	// use a property from a vocabulary as predicate, or create a property relative to the namespace of the CSV
	String propertyURL = Models.getPropertyString(metadata, column, CSVW.PROPERTY_URL)
			.orElseGet(() -> "_local:" + parser.getName());
	parser.setPropertyIRI(metadata.getNamespaces(), propertyURL);

	return parser;
}

/**
 * Get the IRI of the datatype of a column. When the datatype is described by a blank node (derived datatype), the
 * base of that description is used. When no datatype is found, xsd:string is returned.
 *
 * @param metadata metadata model
 * @param column   resource describing the column
 * @return datatype IRI
 * @throws RDFParseException when the datatype is given by name and that name is not a known XSD datatype
 */
private IRI getDatatypeIRI(Model metadata, Resource column) {
	Optional<Value> val = Models.getProperty(metadata, column, CSVW.DATATYPE);
	// derived datatype: continue with its base
	if (val.isPresent() && val.get().isBNode()) {
		val = Models.getProperty(metadata, (Resource) val.get(), CSVW.BASE);
	}
	if (!val.isPresent()) {
		return XSD.STRING.getIri();
	}
	Value datatype = val.get();
	if (datatype.isIRI()) {
		return (IRI) datatype;
	}
	String name = datatype.stringValue();
	try {
		// Locale.ROOT: the default locale may upper-case differently (e.g. Turkish dotted I),
		// which would make the lookup of the enum constant fail
		return XSD.valueOf(name.toUpperCase(java.util.Locale.ROOT)).getIri();
	} catch (IllegalArgumentException e) {
		throw new RDFParseException("Unsupported datatype '" + name + "' for column " + column, e);
	}
}

/**
 * Look up the format string (e.g. a date format) of a column. A format is only available when the datatype of the
 * column is described by a blank node that carries a literal format value.
 *
 * @param metadata metadata model
 * @param column   resource describing the column
 * @return format string, or empty when there is none
 */
private Optional<String> getFormat(Model metadata, Resource column) {
	Optional<Value> datatype = Models.getProperty(metadata, column, CSVW.DATATYPE);
	if (!datatype.isPresent() || !datatype.get().isBNode()) {
		return Optional.empty();
	}
	Resource description = (Resource) datatype.get();
	return Models.getProperty(metadata, description, CSVW.FORMAT)
			.filter(Value::isLiteral)
			.map(Value::stringValue);
}

/**
* Get "about" URL template, to be used to create the subject of the triples
Expand Down Expand Up @@ -414,7 +324,7 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse

String aboutURL = getAboutURL(metadata, table);

Charset encoding = getEncoding(metadata, table);
Charset encoding = CSVWUtil.getEncoding(metadata, table);
boolean minimal = getParserConfig().get(CSVWParserSettings.MINIMAL_MODE);

// check for placeholder / column name that's being used to create subject IRI
Expand All @@ -435,7 +345,7 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
long line = 1;
try (InputStream is = csvFile.toURL().openStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(is, encoding));
CSVReader csv = getCSVReader(metadata, table, reader)) {
CSVReader csv = CSVWUtil.getCSVReader(metadata, table, reader)) {

Map<String, String> values = null;
String[] cells;
Expand All @@ -451,13 +361,15 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse

// csv cells
for (int i = 0; i < cells.length; i++) {
if (doReplace) {
values.put("{_col}", Long.toString(i));
}
if (i == aboutIndex) { // already processed to get subject
if (doReplace) {
values.put(cellParsers[i].getNameEncoded(), cellParsers[i].parse(cells[i]).stringValue());
}
continue;
}

Value val = cellParsers[i].parse(cells[i]);
if (doReplace) {
values.put(cellParsers[i].getNameEncoded(), val.stringValue());
Expand All @@ -477,6 +389,9 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
}
// virtual columns, if any
for (int i = cells.length; i < cellParsers.length; i++) {
if (doReplace) {
values.put("{_col}", Long.toString(i));
}
handleStatement(handler, cellParsers[i], null, aboutSubject, values);
}
line++;
Expand Down Expand Up @@ -529,55 +444,6 @@ private void handleStatement(RDFHandler handler, CellParser cellParser, String c
handler.handleStatement(stmt);
}

/**
 * Get a CSV file reader, configured according to the dialect description of the table (if any).
 * <p>
 * Without a dialect, one (header) line is skipped. With a dialect, the number of skipped lines is the number of
 * rows to skip plus the number of header rows. The header row count defaults to 1, or to 0 when the dialect
 * explicitly sets the header to false. An explicit header row count takes precedence over the header flag.
 *
 * @param metadata metadata model
 * @param table    resource describing the table
 * @param reader   reader providing the CSV content
 * @return configured CSV reader
 * @throws RDFParseException when the header row count or the number of rows to skip is not an integer
 */
private CSVReader getCSVReader(Model metadata, Resource table, Reader reader) {
	CSVParserBuilder parserBuilder = new CSVParserBuilder();
	CSVReaderBuilder builder = new CSVReaderBuilder(reader);
	builder.withSkipLines(1);

	Optional<Value> val = Models.getProperty(metadata, table, CSVW.DIALECT);
	if (val.isPresent()) {
		Resource dialect = (Resource) val.get();

		// skip header (and possibly other) rows, also when the "header" property itself is absent
		boolean header = Models.getPropertyString(metadata, dialect, CSVW.HEADER)
				.map(v -> !v.equalsIgnoreCase("false"))
				.orElse(true);
		String headerRows = Models.getPropertyString(metadata, dialect, CSVW.HEADER_ROW_COUNT)
				.orElse(header ? "1" : "0");
		String skipRows = Models.getPropertyString(metadata, dialect, CSVW.SKIP_ROWS).orElse("0");
		int skip;
		try {
			skip = Integer.parseInt(headerRows) + Integer.parseInt(skipRows);
		} catch (NumberFormatException e) {
			throw new RDFParseException("Invalid header row count '" + headerRows + "' or skip rows '"
					+ skipRows + "' in dialect of table " + table, e);
		}
		builder.withSkipLines(skip);

		// an empty string has no first character to use, keep the parser default in that case
		Models.getPropertyString(metadata, dialect, CSVW.DELIMITER)
				.filter(v -> !v.isEmpty())
				.ifPresent(v -> parserBuilder.withSeparator(v.charAt(0)));
		Models.getPropertyString(metadata, dialect, CSVW.QUOTE_CHAR)
				.filter(v -> !v.isEmpty())
				.ifPresent(v -> parserBuilder.withQuoteChar(v.charAt(0)));
	}
	return builder.withCSVParser(parserBuilder.build()).build();
}

/**
 * Determine the character set of the CSV file: the encoding mentioned in the dialect description of the table, or
 * UTF-8 when there is no dialect or the dialect does not mention an encoding.
 *
 * @param metadata metadata model
 * @param table    resource describing the table
 * @return charset
 */
private Charset getEncoding(Model metadata, Resource table) {
	Optional<Value> dialect = Models.getProperty(metadata, table, CSVW.DIALECT);
	if (!dialect.isPresent()) {
		return StandardCharsets.UTF_8;
	}
	Resource description = (Resource) dialect.get();
	return Models.getPropertyString(metadata, description, CSVW.ENCODING)
			.map(Charset::forName)
			.orElse(StandardCharsets.UTF_8);
}

/**
* Get subject IRI or blank node
*
Expand Down
180 changes: 180 additions & 0 deletions core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
/*******************************************************************************
* Copyright (c) 2024 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw;

import com.opencsv.CSVParserBuilder;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Optional;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.base.CoreDatatype;
import org.eclipse.rdf4j.model.util.Models;
import org.eclipse.rdf4j.model.vocabulary.CSVW;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.csvw.parsers.CellParser;
import org.eclipse.rdf4j.rio.csvw.parsers.CellParserFactory;


/**
 * Utility class, mostly about configuring the CSV reader and the cell parsers based on the JSON-LD metadata
 *
 * @author Bart Hanssens
 */
public class CSVWUtil {

	/**
	 * Get a CSV file reader, configured according to the dialect description of the table (if any).
	 * <p>
	 * Without a dialect, one (header) line is skipped. With a dialect, the number of skipped lines is the number of
	 * rows to skip plus the number of header rows. The header row count defaults to 1, or to 0 when the dialect
	 * explicitly sets the header to false. An explicit header row count takes precedence over the header flag.
	 *
	 * @param metadata metadata model
	 * @param table    resource describing the table
	 * @param reader   reader providing the CSV content
	 * @return configured CSV reader
	 * @throws RDFParseException when the header row count or the number of rows to skip is not an integer
	 */
	protected static CSVReader getCSVReader(Model metadata, Resource table, Reader reader) {
		CSVParserBuilder parserBuilder = new CSVParserBuilder();
		CSVReaderBuilder builder = new CSVReaderBuilder(reader);
		builder.withSkipLines(1);

		Optional<Value> val = Models.getProperty(metadata, table, CSVW.DIALECT);
		if (val.isPresent()) {
			Resource dialect = (Resource) val.get();

			// skip header (and possibly other) rows, also when the "header" property itself is absent
			boolean header = Models.getPropertyString(metadata, dialect, CSVW.HEADER)
					.map(v -> !v.equalsIgnoreCase("false"))
					.orElse(true);
			String headerRows = Models.getPropertyString(metadata, dialect, CSVW.HEADER_ROW_COUNT)
					.orElse(header ? "1" : "0");
			String skipRows = Models.getPropertyString(metadata, dialect, CSVW.SKIP_ROWS).orElse("0");
			int skip;
			try {
				skip = Integer.parseInt(headerRows) + Integer.parseInt(skipRows);
			} catch (NumberFormatException e) {
				throw new RDFParseException("Invalid header row count '" + headerRows + "' or skip rows '"
						+ skipRows + "' in dialect of table " + table, e);
			}
			builder.withSkipLines(skip);

			// an empty string has no first character to use, keep the parser default in that case
			Models.getPropertyString(metadata, dialect, CSVW.DELIMITER)
					.filter(v -> !v.isEmpty())
					.ifPresent(v -> parserBuilder.withSeparator(v.charAt(0)));
			Models.getPropertyString(metadata, dialect, CSVW.QUOTE_CHAR)
					.filter(v -> !v.isEmpty())
					.ifPresent(v -> parserBuilder.withQuoteChar(v.charAt(0)));
		}
		return builder.withCSVParser(parserBuilder.build()).build();
	}

	/**
	 * Get charset of the CSV: the encoding mentioned in the dialect description of the table, or UTF-8 when there is
	 * no dialect or the dialect does not mention an encoding.
	 *
	 * @param metadata metadata model
	 * @param table    resource describing the table
	 * @return charset
	 */
	protected static Charset getEncoding(Model metadata, Resource table) {
		Optional<Value> dialect = Models.getProperty(metadata, table, CSVW.DIALECT);
		if (!dialect.isPresent()) {
			return StandardCharsets.UTF_8;
		}
		return Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.ENCODING)
				.map(Charset::forName)
				.orElse(StandardCharsets.UTF_8);
	}

	/**
	 * Get the IRI of the datatype of a column. When the datatype is described by a blank node (derived datatype),
	 * the base of that description is used. When no datatype is found, xsd:string is returned.
	 *
	 * @param metadata metadata model
	 * @param column   resource describing the column
	 * @return datatype IRI
	 * @throws RDFParseException when the datatype is given by name and that name is not a known XSD datatype
	 */
	private static IRI getDatatypeIRI(Model metadata, Resource column) {
		Optional<Value> val = Models.getProperty(metadata, column, CSVW.DATATYPE);
		// derived datatype: continue with its base
		if (val.isPresent() && val.get().isBNode()) {
			val = Models.getProperty(metadata, (Resource) val.get(), CSVW.BASE);
		}
		if (!val.isPresent()) {
			return CoreDatatype.XSD.STRING.getIri();
		}
		Value datatype = val.get();
		if (datatype.isIRI()) {
			return (IRI) datatype;
		}
		String name = datatype.stringValue();
		try {
			// Locale.ROOT: the default locale may upper-case differently (e.g. Turkish dotted I),
			// which would make the lookup of the enum constant fail
			return CoreDatatype.XSD.valueOf(name.toUpperCase(Locale.ROOT)).getIri();
		} catch (IllegalArgumentException e) {
			throw new RDFParseException("Unsupported datatype '" + name + "' for column " + column, e);
		}
	}

	/**
	 * Get format string, e.g. date format. A format is only available when the datatype of the column is described
	 * by a blank node that carries a literal format value.
	 *
	 * @param metadata metadata model
	 * @param column   resource describing the column
	 * @return format string, or empty when there is none
	 */
	private static Optional<String> getFormat(Model metadata, Resource column) {
		Optional<Value> datatype = Models.getProperty(metadata, column, CSVW.DATATYPE);
		if (!datatype.isPresent() || !datatype.get().isBNode()) {
			return Optional.empty();
		}
		return Models.getProperty(metadata, (Resource) datatype.get(), CSVW.FORMAT)
				.filter(Value::isLiteral)
				.map(Value::stringValue);
	}

	/**
	 * Create and configure the cell parser for a specific column, based on the column description in the metadata.
	 *
	 * @param metadata metadata model
	 * @param column   resource describing the column
	 * @return configured cell parser
	 * @throws RDFParseException when the column description does not contain a name
	 */
	protected static CellParser getCellParser(Model metadata, Resource column) {
		IRI datatype = getDatatypeIRI(metadata, column);

		CellParser parser = CellParserFactory.create(datatype);

		// the name is mandatory here: it is used below to build the default property IRI.
		// The exception must actually be thrown, merely constructing it silently accepts the missing name
		String name = Models.getPropertyString(metadata, column, CSVW.NAME)
				.orElseThrow(
						() -> new RDFParseException("Metadata file does not contain name for column " + column));
		parser.setName(name);

		Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v));
		Models.getPropertyString(metadata, column, CSVW.REQUIRED)
				.ifPresent(v -> parser.setRequired(Boolean.parseBoolean(v)));
		Models.getPropertyString(metadata, column, CSVW.VIRTUAL)
				.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v)));
		// NOTE(review): this overwrites the "virtual" flag with the value of "suppressOutput", which looks like
		// a copy-paste slip. Confirm which CellParser setter is intended before changing it.
		Models.getPropertyString(metadata, column, CSVW.SUPPRESS_OUTPUT)
				.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v)));

		// only useful for strings
		Models.getPropertyString(metadata, column, CSVW.LANG).ifPresent(v -> parser.setLang(v));

		// only useful for numeric
		Models.getPropertyString(metadata, column, CSVW.DECIMAL_CHAR).ifPresent(v -> parser.setDecimalChar(v));
		Models.getPropertyString(metadata, column, CSVW.GROUP_CHAR).ifPresent(v -> parser.setGroupChar(v));

		// mostly for date formats
		getFormat(metadata, column).ifPresent(v -> parser.setFormat(v));

		// NOTE(review): same as above, "trim" currently ends up in the "virtual" flag (and wins over the two
		// earlier assignments when present). Confirm the intended setter.
		Models.getPropertyString(metadata, column, CSVW.TRIM)
				.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v)));

		Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueUrl(v));

		// use a property from a vocabulary as predicate, or create a property relative to the namespace of the CSV
		String propertyURL = Models.getPropertyString(metadata, column, CSVW.PROPERTY_URL)
				.orElseGet(() -> "_local:" + parser.getName());
		parser.setPropertyIRI(metadata.getNamespaces(), propertyURL);

		return parser;
	}
}

0 comments on commit ed1e748

Please sign in to comment.