001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.commons.rdf.experimental; 019 020import java.io.IOException; 021import java.io.InputStream; 022import java.nio.charset.StandardCharsets; 023import java.nio.file.Path; 024import java.util.Optional; 025import java.util.concurrent.Future; 026import java.util.function.Consumer; 027 028import org.apache.commons.rdf.api.BlankNode; 029import org.apache.commons.rdf.api.Dataset; 030import org.apache.commons.rdf.api.Graph; 031import org.apache.commons.rdf.api.IRI; 032import org.apache.commons.rdf.api.Quad; 033import org.apache.commons.rdf.api.RDFSyntax; 034import org.apache.commons.rdf.api.RDFTerm; 035import org.apache.commons.rdf.api.RDF; 036import org.apache.commons.rdf.api.Triple; 037 038/** 039 * Parse an RDF source into a target (e.g. a Graph/Dataset). 040 * <h2>Experimental</h2> This interface (and its implementations) should be 041 * considered <strong>at risk</strong>; they might change or be removed in the 042 * next minor update of Commons RDF. It may move to the the 043 * {@link org.apache.commons.rdf.api} package when it has stabilized. 044 * <h2>Description</h2> 045 * <p> 046 * This interface follows the 047 * <a href="https://en.wikipedia.org/wiki/Builder_pattern">Builder pattern</a>, 048 * allowing to set parser settings like {@link #contentType(RDFSyntax)} and 049 * {@link #base(IRI)}. A caller MUST call one of the <code>source</code> methods 050 * (e.g. {@link #source(IRI)}, {@link #source(Path)}, 051 * {@link #source(InputStream)}), and MUST call one of the <code>target</code> 052 * methods (e.g. {@link #target(Consumer)}, {@link #target(Dataset)}, 053 * {@link #target(Graph)}) before calling {@link #parse()} on the returned 054 * RDFParser - however methods can be called in any order. 055 * <p> 056 * The call to {@link #parse()} returns a {@link Future}, allowing asynchronous 057 * parse operations. Callers are recommended to check {@link Future#get()} to 058 * ensure parsing completed successfully, or catch exceptions thrown during 059 * parsing. 060 * <p> 061 * Setting a method that has already been set will override any existing value 062 * in the returned builder - regardless of the parameter type (e.g. 063 * {@link #source(IRI)} will override a previous {@link #source(Path)}. Settings 064 * can be unset by passing <code>null</code> - note that this may require 065 * casting, e.g. <code>contentType( (RDFSyntax) null )</code> to undo a previous 066 * call to {@link #contentType(RDFSyntax)}. 067 * <p> 068 * It is undefined if a RDFParser is mutable or thread-safe, so callers should 069 * always use the returned modified RDFParser from the builder methods. The 070 * builder may return itself after modification, or a cloned builder with the 071 * modified settings applied. Implementations are however encouraged to be 072 * immutable, thread-safe and document this. As an example starting point, see 073 * <code>org.apache.commons.rdf.simple.AbstractRDFParser</code>. 074 * <p> 075 * Example usage: 076 * </p> 077 * 078 * <pre> 079 * Graph g1 = rDFTermFactory.createGraph(); 080 * new ExampleRDFParserBuilder().source(Paths.get("/tmp/graph.ttl")).contentType(RDFSyntax.TURTLE).target(g1).parse() 081 * .get(30, TimeUnit.Seconds); 082 * </pre> 083 * 084 */ 085public interface RDFParser { 086 087 /** 088 * The result of {@link RDFParser#parse()} indicating parsing completed. 089 * <p> 090 * This is a marker interface that may be subclassed to include parser 091 * details, e.g. warning messages or triple counts. 092 */ 093 interface ParseResult { 094 } 095 096 /** 097 * Specify which {@link RDF} to use for generating {@link RDFTerm}s. 098 * <p> 099 * This option may be used together with {@link #target(Graph)} to override 100 * the implementation's default factory and graph. 101 * <p> 102 * <strong>Warning:</strong> Using the same {@link RDF} for multiple 103 * {@link #parse()} calls may accidentally merge {@link BlankNode}s having 104 * the same label, as the parser may use the 105 * {@link RDF#createBlankNode(String)} method from the parsed blank node 106 * labels. 107 * 108 * @see #target(Graph) 109 * @param rdfTermFactory 110 * {@link RDF} to use for generating RDFTerms. 111 * @return An {@link RDFParser} that will use the specified rdfTermFactory 112 */ 113 RDFParser rdfTermFactory(RDF rdfTermFactory); 114 115 /** 116 * Specify the content type of the RDF syntax to parse. 117 * <p> 118 * This option can be used to select the RDFSyntax of the source, overriding 119 * any <code>Content-Type</code> headers or equivalent. 120 * <p> 121 * The character set of the RDFSyntax is assumed to be 122 * {@link StandardCharsets#UTF_8} unless overridden within the document 123 * (e.g. {@code <?xml version="1.0" encoding="iso-8859-1"?>} in 124 * {@link RDFSyntax#RDFXML}). 125 * <p> 126 * This method will override any contentType set with 127 * {@link #contentType(String)}. 128 * 129 * @see #contentType(String) 130 * @param rdfSyntax 131 * An {@link RDFSyntax} to parse the source according to, e.g. 132 * {@link RDFSyntax#TURTLE}. 133 * @throws IllegalArgumentException 134 * If this RDFParser does not support the specified RDFSyntax. 135 * @return An {@link RDFParser} that will use the specified content type. 136 */ 137 RDFParser contentType(RDFSyntax rdfSyntax) throws IllegalArgumentException; 138 139 /** 140 * Specify the content type of the RDF syntax to parse. 141 * <p> 142 * This option can be used to select the RDFSyntax of the source, overriding 143 * any <code>Content-Type</code> headers or equivalent. 144 * <p> 145 * The content type MAY include a <code>charset</code> parameter if the RDF 146 * media types permit it; the default charset is 147 * {@link StandardCharsets#UTF_8} unless overridden within the document. 148 * <p> 149 * This method will override any contentType set with 150 * {@link #contentType(RDFSyntax)}. 151 * 152 * @see #contentType(RDFSyntax) 153 * @param contentType 154 * A content-type string, e.g. <code>application/ld+json</code> 155 * or <code>text/turtle;charset="UTF-8"</code> as specified by 156 * <a href="https://tools.ietf.org/html/rfc7231#section-3.1.1.1"> 157 * RFC7231</a>. 158 * @return An {@link RDFParser} that will use the specified content type. 159 * @throws IllegalArgumentException 160 * If the contentType has an invalid syntax, or this RDFParser 161 * does not support the specified contentType. 162 */ 163 RDFParser contentType(String contentType) throws IllegalArgumentException; 164 165 /** 166 * Specify a {@link Graph} to add parsed triples to. 167 * <p> 168 * If the source supports datasets (e.g. the {@link #contentType(RDFSyntax)} 169 * set has {@link RDFSyntax#supportsDataset} is true)), then only quads in 170 * the <em>default graph</em> will be added to the Graph as {@link Triple}s. 171 * <p> 172 * It is undefined if any triples are added to the specified {@link Graph} 173 * if {@link #parse()} throws any exceptions. (However implementations are 174 * free to prevent this using transaction mechanisms or similar). If 175 * {@link Future#get()} does not indicate an exception, the parser 176 * implementation SHOULD have inserted all parsed triples to the specified 177 * graph. 178 * <p> 179 * Calling this method will override any earlier targets set with 180 * {@link #target(Graph)}, {@link #target(Consumer)} or 181 * {@link #target(Dataset)}. 182 * <p> 183 * The default implementation of this method calls {@link #target(Consumer)} 184 * with a {@link Consumer} that does {@link Graph#add(Triple)} with 185 * {@link Quad#asTriple()} if the quad is in the default graph. 186 * 187 * @param graph 188 * The {@link Graph} to add triples to. 189 * @return An {@link RDFParser} that will insert triples into the specified 190 * graph. 191 */ 192 default RDFParser target(final Graph graph) { 193 return target(q -> { 194 if (!q.getGraphName().isPresent()) { 195 graph.add(q.asTriple()); 196 } 197 }); 198 } 199 200 /** 201 * Specify a {@link Dataset} to add parsed quads to. 202 * <p> 203 * It is undefined if any quads are added to the specified {@link Dataset} 204 * if {@link #parse()} throws any exceptions. (However implementations are 205 * free to prevent this using transaction mechanisms or similar). On the 206 * other hand, if {@link #parse()} does not indicate an exception, the 207 * implementation SHOULD have inserted all parsed quads to the specified 208 * dataset. 209 * <p> 210 * Calling this method will override any earlier targets set with 211 * {@link #target(Graph)}, {@link #target(Consumer)} or 212 * {@link #target(Dataset)}. 213 * <p> 214 * The default implementation of this method calls {@link #target(Consumer)} 215 * with a {@link Consumer} that does {@link Dataset#add(Quad)}. 216 * 217 * @param dataset 218 * The {@link Dataset} to add quads to. 219 * @return An {@link RDFParser} that will insert triples into the specified 220 * dataset. 221 */ 222 default RDFParser target(final Dataset dataset) { 223 return target(dataset::add); 224 } 225 226 /** 227 * Specify a consumer for parsed quads. 228 * <p> 229 * The quads will include triples in all named graphs of the parsed source, 230 * including any triples in the default graph. When parsing a source format 231 * which do not support datasets, all quads delivered to the consumer will 232 * be in the default graph (e.g. their {@link Quad#getGraphName()} will be 233 * as {@link Optional#empty()}), while for a source 234 * <p> 235 * It is undefined if any quads are consumed if {@link #parse()} throws any 236 * exceptions. On the other hand, if {@link #parse()} does not indicate an 237 * exception, the implementation SHOULD have produced all parsed quads to 238 * the specified consumer. 239 * <p> 240 * Calling this method will override any earlier targets set with 241 * {@link #target(Graph)}, {@link #target(Consumer)} or 242 * {@link #target(Dataset)}. 243 * <p> 244 * The consumer is not assumed to be thread safe - only one 245 * {@link Consumer#accept(Object)} is delivered at a time for a given 246 * {@link RDFParser#parse()} call. 247 * <p> 248 * This method is typically called with a functional consumer, for example: 249 * 250 * <pre> 251 * {@code 252 * List<Quad> quads = new ArrayList<Quad>; 253 * parserBuilder.target(quads::add).parse(); 254 * } 255 * </pre> 256 * 257 * @param consumer 258 * A {@link Consumer} of {@link Quad}s 259 * @return An {@link RDFParser} that will call the consumer for into the 260 * specified dataset. 261 */ 262 RDFParser target(Consumer<Quad> consumer); 263 264 /** 265 * Specify a base IRI to use for parsing any relative IRI references. 266 * <p> 267 * Setting this option will override any protocol-specific base IRI (e.g. 268 * <code>Content-Location</code> header) or the {@link #source(IRI)} IRI, 269 * but does not override any base IRIs set within the source document (e.g. 270 * <code>@base</code> in Turtle documents). 271 * <p> 272 * If the source is in a syntax that does not support relative IRI 273 * references (e.g. {@link RDFSyntax#NTRIPLES}), setting the 274 * <code>base</code> has no effect. 275 * <p> 276 * This method will override any base IRI set with {@link #base(String)}. 277 * 278 * @see #base(String) 279 * @param base 280 * An absolute IRI to use as a base. 281 * @return An {@link RDFParser} that will use the specified base IRI. 282 */ 283 RDFParser base(IRI base); 284 285 /** 286 * Specify a base IRI to use for parsing any relative IRI references. 287 * <p> 288 * Setting this option will override any protocol-specific base IRI (e.g. 289 * <code>Content-Location</code> header) or the {@link #source(IRI)} IRI, 290 * but does not override any base IRIs set within the source document (e.g. 291 * <code>@base</code> in Turtle documents). 292 * <p> 293 * If the source is in a syntax that does not support relative IRI 294 * references (e.g. {@link RDFSyntax#NTRIPLES}), setting the 295 * <code>base</code> has no effect. 296 * <p> 297 * This method will override any base IRI set with {@link #base(IRI)}. 298 * 299 * @see #base(IRI) 300 * @param base 301 * An absolute IRI to use as a base. 302 * @return An {@link RDFParser} that will use the specified base IRI. 303 * @throws IllegalArgumentException 304 * If the base is not a valid absolute IRI string 305 */ 306 RDFParser base(String base) throws IllegalArgumentException; 307 308 /** 309 * Specify a source {@link InputStream} to parse. 310 * <p> 311 * The source set will not be read before the call to {@link #parse()}. 312 * <p> 313 * The InputStream will not be closed after parsing. The InputStream does 314 * not need to support {@link InputStream#markSupported()}. 315 * <p> 316 * The parser might not consume the complete stream (e.g. an RDF/XML parser 317 * may not read beyond the closing tag of 318 * <code></rdf:Description></code>). 319 * <p> 320 * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} 321 * SHOULD be set before calling {@link #parse()}. 322 * <p> 323 * The character set is assumed to be {@link StandardCharsets#UTF_8} unless 324 * the {@link #contentType(String)} specifies otherwise or the document 325 * declares its own charset (e.g. RDF/XML with a 326 * <code><?xml encoding="iso-8859-1"></code> header). 327 * <p> 328 * The {@link #base(IRI)} or {@link #base(String)} MUST be set before 329 * calling {@link #parse()}, unless the RDF syntax does not permit relative 330 * IRIs (e.g. {@link RDFSyntax#NTRIPLES}). 331 * <p> 332 * This method will override any source set with {@link #source(IRI)}, 333 * {@link #source(Path)} or {@link #source(String)}. 334 * 335 * @param inputStream 336 * An InputStream to consume 337 * @return An {@link RDFParser} that will use the specified source. 338 */ 339 RDFParser source(InputStream inputStream); 340 341 /** 342 * Specify a source file {@link Path} to parse. 343 * <p> 344 * The source set will not be read before the call to {@link #parse()}. 345 * <p> 346 * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} 347 * SHOULD be set before calling {@link #parse()}. 348 * <p> 349 * The character set is assumed to be {@link StandardCharsets#UTF_8} unless 350 * the {@link #contentType(String)} specifies otherwise or the document 351 * declares its own charset (e.g. RDF/XML with a 352 * <code><?xml encoding="iso-8859-1"></code> header). 353 * <p> 354 * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling 355 * {@link #parse()}, otherwise {@link Path#toUri()} will be used as the base 356 * IRI. 357 * <p> 358 * This method will override any source set with {@link #source(IRI)}, 359 * {@link #source(InputStream)} or {@link #source(String)}. 360 * 361 * @param file 362 * A Path for a file to parse 363 * @return An {@link RDFParser} that will use the specified source. 364 */ 365 RDFParser source(Path file); 366 367 /** 368 * Specify an absolute source {@link IRI} to retrieve and parse. 369 * <p> 370 * The source set will not be read before the call to {@link #parse()}. 371 * <p> 372 * If this builder does not support the given IRI protocol (e.g. 373 * <code>urn:uuid:ce667463-c5ab-4c23-9b64-701d055c4890</code>), this method 374 * should succeed, while the {@link #parse()} should throw an 375 * {@link IOException}. 376 * <p> 377 * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} MAY 378 * be set before calling {@link #parse()}, in which case that type MAY be 379 * used for content negotiation (e.g. <code>Accept</code> header in HTTP), 380 * and SHOULD be used for selecting the RDFSyntax. 381 * <p> 382 * The character set is assumed to be {@link StandardCharsets#UTF_8} unless 383 * the protocol's equivalent of <code>Content-Type</code> specifies 384 * otherwise or the document declares its own charset (e.g. RDF/XML with a 385 * <code><?xml encoding="iso-8859-1"></code> header). 386 * <p> 387 * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling 388 * {@link #parse()}, otherwise the source IRI will be used as the base IRI. 389 * <p> 390 * This method will override any source set with {@link #source(Path)}, 391 * {@link #source(InputStream)} or {@link #source(String)}. 392 * 393 * @param iri 394 * An IRI to retrieve and parse 395 * @return An {@link RDFParser} that will use the specified source. 396 */ 397 RDFParser source(IRI iri); 398 399 /** 400 * Specify an absolute source IRI to retrieve and parse. 401 * <p> 402 * The source set will not be read before the call to {@link #parse()}. 403 * <p> 404 * If this builder does not support the given IRI (e.g. 405 * <code>urn:uuid:ce667463-c5ab-4c23-9b64-701d055c4890</code>), this method 406 * should succeed, while the {@link #parse()} should throw an 407 * {@link IOException}. 408 * <p> 409 * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} MAY 410 * be set before calling {@link #parse()}, in which case that type MAY be 411 * used for content negotiation (e.g. <code>Accept</code> header in HTTP), 412 * and SHOULD be used for selecting the RDFSyntax. 413 * <p> 414 * The character set is assumed to be {@link StandardCharsets#UTF_8} unless 415 * the protocol's equivalent of <code>Content-Type</code> specifies 416 * otherwise or the document declares its own charset (e.g. RDF/XML with a 417 * <code><?xml encoding="iso-8859-1"></code> header). 418 * <p> 419 * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling 420 * {@link #parse()}, otherwise the source IRI will be used as the base IRI. 421 * <p> 422 * This method will override any source set with {@link #source(Path)}, 423 * {@link #source(InputStream)} or {@link #source(IRI)}. 424 * 425 * @param iri 426 * An IRI to retrieve and parse 427 * @return An {@link RDFParser} that will use the specified source. 428 * @throws IllegalArgumentException 429 * If the base is not a valid absolute IRI string 430 * 431 */ 432 RDFParser source(String iri) throws IllegalArgumentException; 433 434 /** 435 * Parse the specified source. 436 * <p> 437 * A source method (e.g. {@link #source(InputStream)}, {@link #source(IRI)}, 438 * {@link #source(Path)}, {@link #source(String)} or an equivalent subclass 439 * method) MUST have been called before calling this method, otherwise an 440 * {@link IllegalStateException} will be thrown. 441 * <p> 442 * A target method (e.g. {@link #target(Consumer)}, 443 * {@link #target(Dataset)}, {@link #target(Graph)} or an equivalent 444 * subclass method) MUST have been called before calling parse(), otherwise 445 * an {@link IllegalStateException} will be thrown. 446 * <p> 447 * It is undefined if this method is thread-safe, however the 448 * {@link RDFParser} may be reused (e.g. setting a different source) as soon 449 * as the {@link Future} has been returned from this method. 450 * <p> 451 * The RDFParser SHOULD perform the parsing as an asynchronous operation, 452 * and return the {@link Future} as soon as preliminary checks (such as 453 * validity of the {@link #source(IRI)} and {@link #contentType(RDFSyntax)} 454 * settings) have finished. The future SHOULD not mark 455 * {@link Future#isDone()} before parsing is complete. A synchronous 456 * implementation MAY be blocking on the <code>parse()</code> call and 457 * return a Future that is already {@link Future#isDone()}. 458 * <p> 459 * The returned {@link Future} contains a {@link ParseResult}. 460 * Implementations may subclass this interface to provide any parser 461 * details, e.g. list of warnings. <code>null</code> is a possible return 462 * value if no details are available, but parsing succeeded. 463 * <p> 464 * If an exception occurs during parsing, (e.g. {@link IOException} or 465 * <code>org.apache.commons.rdf.simple.experimental.RDFParseException</code>), 466 * it should be indicated as the 467 * {@link java.util.concurrent.ExecutionException#getCause()} in the 468 * {@link java.util.concurrent.ExecutionException} thrown on 469 * {@link Future#get()}. 470 * 471 * @return A Future that will return the populated {@link Graph} when the 472 * parsing has finished. 473 * @throws IOException 474 * If an error occurred while starting to read the source (e.g. 475 * file not found, unsupported IRI protocol). Note that IO 476 * errors during parsing would instead be the 477 * {@link java.util.concurrent.ExecutionException#getCause()} of 478 * the {@link java.util.concurrent.ExecutionException} thrown on 479 * {@link Future#get()}. 480 * @throws IllegalStateException 481 * If the builder is in an invalid state, e.g. a 482 * <code>source</code> has not been set. 483 */ 484 Future<? extends ParseResult> parse() throws IOException, IllegalStateException; 485}