Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 11, 2024
1 parent a48f845 commit a56cb4b
Show file tree
Hide file tree
Showing 10 changed files with 248 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,24 @@ public class CSVW {
/** csvw:datatype */
public static final IRI DATATYPE;

/** csvw:decimalChar */
public static final IRI DECIMAL_CHAR;

/** csvw:default */
public static final IRI DEFAULT;

/** csvw:delimiter */
public static final IRI DELIMITER;

/** csvw:dialect */
public static final IRI DIALECT;

/** csvw:format */
public static final IRI FORMAT;

/** csvw:groupChar */
public static final IRI GROUP_CHAR;

/** csvw:header */
public static final IRI HEADER;

Expand Down Expand Up @@ -90,14 +99,20 @@ public class CSVW {
/** csvw:valueUrl */
public static final IRI VALUE_URL;

/** csvw:virtual */
public static final IRI VIRTUAL;

static {
ABOUT_URL = Vocabularies.createIRI(NAMESPACE, "aboutUrl");
BASE = Vocabularies.createIRI(NAMESPACE, "base");
COLUMN = Vocabularies.createIRI(NAMESPACE, "column");
DATATYPE = Vocabularies.createIRI(NAMESPACE, "datatype");
DECIMAL_CHAR = Vocabularies.createIRI(NAMESPACE, "decimalChar");
DEFAULT = Vocabularies.createIRI(NAMESPACE, "default");
DELIMITER = Vocabularies.createIRI(NAMESPACE, "delimiter");
DIALECT = Vocabularies.createIRI(NAMESPACE, "dialect");
FORMAT = Vocabularies.createIRI(NAMESPACE, "format");
GROUP_CHAR = Vocabularies.createIRI(NAMESPACE, "groupChar");
HEADER = Vocabularies.createIRI(NAMESPACE, "header");
LANG = Vocabularies.createIRI(NAMESPACE, "lang");
NAME = Vocabularies.createIRI(NAMESPACE, "name");
Expand All @@ -108,5 +123,6 @@ public class CSVW {
TITLE = Vocabularies.createIRI(NAMESPACE, "title");
URL = Vocabularies.createIRI(NAMESPACE, "url");
VALUE_URL = Vocabularies.createIRI(NAMESPACE, "valueUrl");
VIRTUAL = Vocabularies.createIRI(NAMESPACE, "virtual");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -214,20 +214,21 @@ private CellParser getCellParser(Model metadata, Resource column) {

CellParser parser = CellParserFactory.create(datatype);

Models.getPropertyString(metadata, column, CSVW.LANG).ifPresent(v -> parser.setLang(v));
getFormat(metadata, column).ifPresent(v -> parser.setFormat(v.stringValue()));

Models.getProperty(metadata, column, CSVW.NAME)
.ifPresentOrElse(v -> parser.setName(v.stringValue()),
() -> new RDFParseException("Metadata file does not contain name for column " + column));
Models.getProperty(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v.stringValue()));
Models.getProperty(metadata, column, CSVW.REQUIRED)
.ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v.stringValue())));
Models.getProperty(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueURL(v.stringValue()));
Models.getPropertyString(metadata, column, CSVW.NAME)
.ifPresentOrElse(v -> parser.setName(v,
() -> new RDFParseException("Metadata file does not contain name for column " + column));

Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v);
Models.getPropertyString(metadata, column, CSVW.REQUIRED)
.ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v));
Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueURL(v));

// use a property from a vocabulary as predicate, or create a property relative to the namespace of the CSV
Optional<Value> propertyURL = Models.getProperty(metadata, column, CSVW.PROPERTY_URL);
String s = propertyURL.isPresent() ? propertyURL.get().stringValue() : "_local:" + parser.getName();
Optional<String> propertyURL = Models.getPropertyString(metadata, column, CSVW.PROPERTY_URL);
String s = propertyURL.isPresent() ? propertyURL.get() : "_local:" + parser.getName();
parser.setPropertyURL(metadata.getNamespaces(), s);

return parser;
Expand Down Expand Up @@ -260,7 +261,7 @@ private IRI getDatatypeIRI(Model metadata, Resource column) {
}

/**
* Get IRI of base or derived datatype
* Get name of the generic datatype or more specific datatype
*
* @param metadata
* @param column
Expand All @@ -272,7 +273,8 @@ private Optional<Value> getFormat(Model metadata, Resource column) {
Value datatype = val.get();
// derived datatype
if (datatype.isBNode()) {
val = Models.getProperty(metadata, (Resource) datatype, CSVW.FORMAT);
Optional<Value> fmt = Models.getProperty(metadata, (Resource) datatype, CSVW.FORMAT);
val = Models.getProperty(metadata, (Resource) fmt.get(), CSVW.BASE);
}
}
return val;
Expand Down Expand Up @@ -327,10 +329,11 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
String placeholder = (aboutIndex > -1) ? cellParsers[aboutIndex].getName() : null;

LOGGER.info("Parsing {}", csvFile);

long line = 0;
try (InputStream is = csvFile.toURL().openStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
CSVReader csv = getCSVReader(metadata, reader)) {
CSVReader csv = getCSVReader(metadata, table, reader)) {

String[] cells;
while ((cells = csv.readNext()) != null) {
Expand Down Expand Up @@ -359,8 +362,15 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
* @param reader
* @return
*/
private CSVReader getCSVReader(Model metadata, Reader reader) {
private CSVReader getCSVReader(Model metadata, Resource table, Reader reader) {
    // Build the reader for one table: the first line is always skipped, and the separator
    // is taken from the table's csvw:dialect / csvw:delimiter when the metadata provides one.
    CSVParserBuilder parserBuilder = new CSVParserBuilder();

    // csvw:dialect refers to a node holding the syntax options. Unwrap the Optional and make
    // sure the value really is a Resource before using it as a subject (the Optional itself
    // can never be cast to Resource).
    Models.getProperty(metadata, table, CSVW.DIALECT)
            .filter(Resource.class::isInstance)
            .map(Resource.class::cast)
            .flatMap(dialect -> Models.getPropertyString(metadata, dialect, CSVW.DELIMITER))
            .filter(delimiter -> !delimiter.isEmpty())
            // the opencsv parser only supports a single separator character,
            // so only the first character of the delimiter is used
            .ifPresent(delimiter -> parserBuilder.withSeparator(delimiter.charAt(0)));

    return new CSVReaderBuilder(reader).withSkipLines(1).withCSVParser(parserBuilder.build()).build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,29 @@
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw.parsers;

import java.time.format.DateTimeFormatter;
import java.util.Set;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Namespace;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.util.Literals;
import org.eclipse.rdf4j.model.util.Values;
import org.eclipse.rdf4j.rio.RDFParseException;

/**
*
* @author Bart Hanssens
*/
public class CellParser {
private String name;
public abstract class CellParser {
protected String name;
protected IRI dataType;
protected String lang;
protected String defaultValue;
private boolean isRequired;
private IRI propertyIRI;
private String valueUrl;
private String format;
private String separator;
protected boolean isRequired;
protected IRI propertyIRI;
protected String valueUrl;
protected String format;
protected String decimalChar;
protected String groupChar;
protected String separator;

/**
* @param name
Expand All @@ -55,6 +55,15 @@ public void setDataType(IRI dataType) {
this.dataType = dataType;
}

/**
* Set language code
*
* The value is stored as-is; no validation or normalization is done here.
* NOTE(review): presumably used to produce language-tagged literals - confirm against the subclasses.
*
* @param lang language code
*/
public void setLang(String lang) {
this.lang = lang;
}

/**
* @param defaultValue the defaultValue to set
*/
Expand Down Expand Up @@ -123,30 +132,55 @@ public void setSeparator(String separator) {
this.separator = separator;
}

/**
 * Get the decimal character configured for this parser.
 *
 * @return the decimal character, or {@code null} when none was set
 */
public String getDecimalChar() {
    return this.decimalChar;
}

/**
* Set the decimal character, stored as-is without validation.
*
* @param decimalChar the decimal character to set
*/
public void setDecimalChar(String decimalChar) {
this.decimalChar = decimalChar;
}


/**
 * Get the group character configured for this parser.
 *
 * @return the group character, or {@code null} when none was set
 */
public String getGroupChar() {
    return this.groupChar;
}

/**
* Set the group character, stored as-is without validation.
*
* @param groupChar the group character to set
*/
public void setGroupChar(String groupChar) {
this.groupChar = groupChar;
}

/**
* Set the format string, stored as-is without validation.
*
* NOTE(review): how the format is interpreted appears to be up to the concrete parser
* (some parsers in this package override this method) - confirm per subclass.
*
* @param format the format string to set
*/
public void setFormat(String format) {
this.format = format;
}

/**
 * Substitute the configured default value for a missing cell value.
 *
 * @param s raw cell value, may be {@code null}
 * @return the default value when {@code s} is {@code null} or empty and a default value was set,
 *         otherwise {@code s} unchanged (which may still be {@code null} or empty)
 */
protected String getValueOrDefault(String s) {
    boolean missing = (s == null) || s.isEmpty();
    return (missing && defaultValue != null) ? defaultValue : s;
}

/**
* Get the value from a cell
*
* @param cell
* @return
*/
public Value parse(String cell) {
String s = cell;
if ((s == null || s.isEmpty()) && (defaultValue != null)) {
s = defaultValue;
}
if (valueUrl != null && s != null) {
return Values.iri(valueUrl.replace("{" + name + "}", s));
}
System.err.println(s);
System.err.println(dataType);
return Values.literal(s, dataType);
}
public abstract Value parse(String cell);

}
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,8 @@ public void setFormat(String format) {

@Override
public Value parse(String cell) {
String s = cell;
if ((s == null || s.isEmpty()) && (defaultValue != null)) {
s = defaultValue;
}
String s = getValueOrDefault(cell);

return Values.literal(valueTrue.equals(s) ? "true" : "false", dataType);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
package org.eclipse.rdf4j.rio.csvw.parsers;

import java.time.format.DateTimeFormatter;

import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.util.Values;

Expand All @@ -32,10 +33,8 @@ public void setFormat(String format) {

@Override
public Value parse(String cell) {
String s = cell;
if ((s == null || s.isEmpty()) && (defaultValue != null)) {
s = defaultValue;
}
String s = getValueOrDefault(cell);

if (formatter != null) {
s = DateTimeFormatter.ISO_DATE.format(formatter.parse(s));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*******************************************************************************
* Copyright (c) 2024 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw.parsers;

import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.util.Values;

/**
 * Cell parser that normalizes the number notation of a cell before creating a literal: the
 * group character is removed and the decimal character is replaced by a '.'.
 *
 * @author Bart Hanssens
 */
public class CellParserDouble extends CellParser {

    /**
     * Get the value from a cell.
     *
     * The default value is used for a missing cell, then every occurrence of the group character
     * is removed and every occurrence of the decimal character is replaced by '.'. A separator
     * that is not set ({@code null}) or empty is ignored, so the value is passed through unchanged.
     *
     * @param cell raw cell value, may be {@code null} or empty
     * @return literal with the normalized value and the data type of this parser
     */
    @Override
    public Value parse(String cell) {
        String s = getValueOrDefault(cell);

        if (s != null) {
            // group characters are removed first, so a '.' used for grouping
            // cannot be mistaken for the decimal point afterwards
            if (groupChar != null && !groupChar.isEmpty()) {
                s = s.replace(groupChar, "");
            }

            // always use a '.' in RDF, not the European-style ','
            // (an empty target would make String.replace insert a '.' between all characters)
            if (decimalChar != null && !decimalChar.isEmpty() && !".".equals(decimalChar)) {
                s = s.replace(decimalChar, ".");
            }
        }

        return Values.literal(s, dataType);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,31 @@ public class CellParserFactory {
public static CellParser create(IRI datatype) {
CellParser p;

XSD xsdType = XSD.valueOf(datatype.toString());
XSD xsdType = XSD.valueOf(datatype.getLocalName().toUpperCase());
if (xsdType == null) {
p = new CellParser();
p = new CellParserString();
} else {
switch(xsdType) {
case DATE:
case DATETIME:
p = new CellParserDate();
break;
switch (xsdType) {
case BOOLEAN:
p = new CellParserBoolean();
break;
case INTEGER:
case INT:
case SHORT:
case LONG:
p = new CellParserLong();
break;
case FLOAT:
case DOUBLE:
p = new CellParserDouble();
p.setDecimalChar(".");
break;
case DATE:
case DATETIME:
p = new CellParserDate();
break;
default:
p = new CellParser();
p = new CellParserString();
}
}
p.setDataType(datatype);
Expand Down
Loading

0 comments on commit a56cb4b

Please sign in to comment.