forked from eclipse-rdf4j/rdf4j
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
eclipse-rdf4jGH-5058: additional parser code (WIP)
- Loading branch information
1 parent
d91fb36
commit ed1e748
Showing
2 changed files
with
189 additions
and
143 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
180 changes: 180 additions & 0 deletions
180
core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWUtil.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
/******************************************************************************* | ||
* Copyright (c) 2024 Eclipse RDF4J contributors. | ||
* | ||
* All rights reserved. This program and the accompanying materials | ||
* are made available under the terms of the Eclipse Distribution License v1.0 | ||
* which accompanies this distribution, and is available at | ||
* http://www.eclipse.org/org/documents/edl-v10.php. | ||
* | ||
* SPDX-License-Identifier: BSD-3-Clause | ||
*******************************************************************************/ | ||
package org.eclipse.rdf4j.rio.csvw; | ||
|
||
import com.opencsv.CSVParserBuilder; | ||
import com.opencsv.CSVReader; | ||
import com.opencsv.CSVReaderBuilder; | ||
import java.io.Reader; | ||
import java.nio.charset.Charset; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.Optional; | ||
import org.eclipse.rdf4j.model.IRI; | ||
import org.eclipse.rdf4j.model.Model; | ||
import org.eclipse.rdf4j.model.Resource; | ||
import org.eclipse.rdf4j.model.Value; | ||
import org.eclipse.rdf4j.model.base.CoreDatatype; | ||
import org.eclipse.rdf4j.model.util.Models; | ||
import org.eclipse.rdf4j.model.vocabulary.CSVW; | ||
import org.eclipse.rdf4j.rio.RDFParseException; | ||
import org.eclipse.rdf4j.rio.csvw.parsers.CellParser; | ||
import org.eclipse.rdf4j.rio.csvw.parsers.CellParserFactory; | ||
|
||
|
||
/** | ||
* Utility class, mostly about configuring the reader based on the JSON-LD metadata | ||
* | ||
* @author Bart Hanssens | ||
*/ | ||
public class CSVWUtil { | ||
|
||
/** | ||
* Get configured CSV file reader | ||
* | ||
* @param metadata | ||
* @param table | ||
* @param reader | ||
* @return | ||
*/ | ||
protected static CSVReader getCSVReader(Model metadata, Resource table, Reader reader) { | ||
CSVParserBuilder parserBuilder = new CSVParserBuilder(); | ||
CSVReaderBuilder builder = new CSVReaderBuilder(reader); | ||
builder.withSkipLines(1); | ||
|
||
Optional<Value> val = Models.getProperty(metadata, table, CSVW.DIALECT); | ||
if (val.isPresent()) { | ||
Resource dialect = (Resource) val.get(); | ||
|
||
// skip header (and possibly other) rows | ||
String headerRows = Models.getPropertyString(metadata, dialect, CSVW.HEADER_ROW_COUNT).orElse("1"); | ||
String skipRows = Models.getPropertyString(metadata, dialect, CSVW.SKIP_ROWS).orElse("0"); | ||
int skip = Integer.valueOf(headerRows) + Integer.valueOf(skipRows); | ||
Models.getPropertyString(metadata, dialect, CSVW.HEADER) | ||
.ifPresent(v -> builder.withSkipLines(v.equalsIgnoreCase("false") ? 0 : skip)); | ||
|
||
Models.getPropertyString(metadata, dialect, CSVW.DELIMITER) | ||
.ifPresent(v -> parserBuilder.withSeparator(v.charAt(0))); | ||
Models.getPropertyString(metadata, dialect, CSVW.QUOTE_CHAR) | ||
.ifPresent(v -> parserBuilder.withQuoteChar(v.charAt(0))); | ||
} | ||
return builder.withCSVParser(parserBuilder.build()).build(); | ||
} | ||
|
||
/** | ||
* Get charset of the CSV, by default this should be UTF-8 | ||
* | ||
* @param metadata | ||
* @param table | ||
* @return charset | ||
*/ | ||
protected static Charset getEncoding(Model metadata, Resource table) { | ||
Optional<Value> dialect = Models.getProperty(metadata, table, CSVW.DIALECT); | ||
if (dialect.isPresent()) { | ||
Optional<String> encoding = Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.ENCODING); | ||
if (encoding.isPresent()) { | ||
return Charset.forName(encoding.get()); | ||
} | ||
} | ||
return StandardCharsets.UTF_8; | ||
} | ||
|
||
/** | ||
* Get name of base or derived datatype | ||
* | ||
* @param metadata | ||
* @param column | ||
* @return | ||
*/ | ||
private static IRI getDatatypeIRI(Model metadata, Resource column) { | ||
Optional<Value> val = Models.getProperty(metadata, column, CSVW.DATATYPE); | ||
if (val.isPresent()) { | ||
Value datatype = val.get(); | ||
// derived datatype | ||
if (datatype.isBNode()) { | ||
val = Models.getProperty(metadata, (Resource) datatype, CSVW.BASE); | ||
} | ||
} | ||
if (!val.isPresent()) { | ||
return CoreDatatype.XSD.STRING.getIri(); | ||
} | ||
Value datatype = val.get(); | ||
if (datatype.isIRI()) { | ||
return (IRI) datatype; | ||
} | ||
return CoreDatatype.XSD.valueOf(datatype.stringValue().toUpperCase()).getIri(); | ||
} | ||
|
||
|
||
/** | ||
* Get format string, e.g date format | ||
* | ||
* @param metadata | ||
* @param column | ||
* @return | ||
*/ | ||
private static Optional<String> getFormat(Model metadata, Resource column) { | ||
Optional<Value> val = Models.getProperty(metadata, column, CSVW.DATATYPE); | ||
if (val.isPresent() && val.get().isBNode()) { | ||
val = Models.getProperty(metadata, (Resource) val.get(), CSVW.FORMAT); | ||
if (val.isPresent() && val.get().isLiteral()) { | ||
return Optional.of(val.get().stringValue()); | ||
} | ||
} | ||
return Optional.empty(); | ||
} | ||
|
||
/** | ||
* Get parser for specific column | ||
* | ||
* @param metadata | ||
* @param column | ||
* @return | ||
*/ | ||
protected static CellParser getCellParser(Model metadata, Resource column) { | ||
IRI datatype = getDatatypeIRI(metadata, column); | ||
|
||
CellParser parser = CellParserFactory.create(datatype); | ||
|
||
Models.getPropertyString(metadata, column, CSVW.NAME) | ||
.ifPresentOrElse(v -> parser.setName(v), | ||
() -> new RDFParseException("Metadata file does not contain name for column " + column)); | ||
|
||
Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v)); | ||
Models.getPropertyString(metadata, column, CSVW.REQUIRED) | ||
.ifPresent(v -> parser.setRequired(Boolean.parseBoolean(v))); | ||
Models.getPropertyString(metadata, column, CSVW.VIRTUAL) | ||
.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v))); | ||
Models.getPropertyString(metadata, column, CSVW.SUPPRESS_OUTPUT) | ||
.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v))); | ||
|
||
// only useful for strings | ||
Models.getPropertyString(metadata, column, CSVW.LANG).ifPresent(v -> parser.setLang(v)); | ||
|
||
// only useful for numeric | ||
Models.getPropertyString(metadata, column, CSVW.DECIMAL_CHAR).ifPresent(v -> parser.setDecimalChar(v)); | ||
Models.getPropertyString(metadata, column, CSVW.GROUP_CHAR).ifPresent(v -> parser.setGroupChar(v)); | ||
|
||
// mostly for date formats | ||
getFormat(metadata, column).ifPresent(v -> parser.setFormat(v)); | ||
|
||
Models.getPropertyString(metadata, column, CSVW.TRIM) | ||
.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v))); | ||
|
||
Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueUrl(v)); | ||
|
||
// use a property from a vocabulary as predicate, or create a property relative to the namespace of the CSV | ||
Optional<String> propertyURL = Models.getPropertyString(metadata, column, CSVW.PROPERTY_URL); | ||
String s = propertyURL.isPresent() ? propertyURL.get() : "_local:" + parser.getName(); | ||
parser.setPropertyIRI(metadata.getNamespaces(), s); | ||
|
||
return parser; | ||
} | ||
} |