001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.commons.rdf.experimental;
019
020import java.io.IOException;
021import java.io.InputStream;
022import java.nio.charset.StandardCharsets;
023import java.nio.file.Path;
024import java.util.Optional;
025import java.util.concurrent.Future;
026import java.util.function.Consumer;
027
028import org.apache.commons.rdf.api.BlankNode;
029import org.apache.commons.rdf.api.Dataset;
030import org.apache.commons.rdf.api.Graph;
031import org.apache.commons.rdf.api.IRI;
032import org.apache.commons.rdf.api.Quad;
033import org.apache.commons.rdf.api.RDFSyntax;
034import org.apache.commons.rdf.api.RDFTerm;
035import org.apache.commons.rdf.api.RDF;
036import org.apache.commons.rdf.api.Triple;
037
038/**
039 * Parse an RDF source into a target (e.g. a Graph/Dataset).
040 * <h2>Experimental</h2> This interface (and its implementations) should be
041 * considered <strong>at risk</strong>; they might change or be removed in the
042 * next minor update of Commons RDF. It may move to the the
043 * {@link org.apache.commons.rdf.api} package when it has stabilized.
044 * <h2>Description</h2>
045 * <p>
046 * This interface follows the
047 * <a href="https://en.wikipedia.org/wiki/Builder_pattern">Builder pattern</a>,
048 * allowing to set parser settings like {@link #contentType(RDFSyntax)} and
049 * {@link #base(IRI)}. A caller MUST call one of the <code>source</code> methods
050 * (e.g. {@link #source(IRI)}, {@link #source(Path)},
051 * {@link #source(InputStream)}), and MUST call one of the <code>target</code>
052 * methods (e.g. {@link #target(Consumer)}, {@link #target(Dataset)},
053 * {@link #target(Graph)}) before calling {@link #parse()} on the returned
054 * RDFParser - however methods can be called in any order.
055 * <p>
056 * The call to {@link #parse()} returns a {@link Future}, allowing asynchronous
057 * parse operations. Callers are recommended to check {@link Future#get()} to
058 * ensure parsing completed successfully, or catch exceptions thrown during
059 * parsing.
060 * <p>
061 * Setting a method that has already been set will override any existing value
062 * in the returned builder - regardless of the parameter type (e.g.
063 * {@link #source(IRI)} will override a previous {@link #source(Path)}. Settings
064 * can be unset by passing <code>null</code> - note that this may require
065 * casting, e.g. <code>contentType( (RDFSyntax) null )</code> to undo a previous
066 * call to {@link #contentType(RDFSyntax)}.
067 * <p>
068 * It is undefined if a RDFParser is mutable or thread-safe, so callers should
069 * always use the returned modified RDFParser from the builder methods. The
070 * builder may return itself after modification, or a cloned builder with the
071 * modified settings applied. Implementations are however encouraged to be
072 * immutable, thread-safe and document this. As an example starting point, see
073 * <code>org.apache.commons.rdf.simple.AbstractRDFParser</code>.
074 * <p>
075 * Example usage:
076 * </p>
077 *
078 * <pre>
079 * Graph g1 = rDFTermFactory.createGraph();
080 * new ExampleRDFParserBuilder().source(Paths.get("/tmp/graph.ttl")).contentType(RDFSyntax.TURTLE).target(g1).parse()
081 *         .get(30, TimeUnit.Seconds);
082 * </pre>
083 *
084 */
085public interface RDFParser {
086
087    /**
088     * The result of {@link RDFParser#parse()} indicating parsing completed.
089     * <p>
090     * This is a marker interface that may be subclassed to include parser
091     * details, e.g. warning messages or triple counts.
092     */
093    interface ParseResult {
094    }
095
096    /**
097     * Specify which {@link RDF} to use for generating {@link RDFTerm}s.
098     * <p>
099     * This option may be used together with {@link #target(Graph)} to override
100     * the implementation's default factory and graph.
101     * <p>
102     * <strong>Warning:</strong> Using the same {@link RDF} for multiple
103     * {@link #parse()} calls may accidentally merge {@link BlankNode}s having
104     * the same label, as the parser may use the
105     * {@link RDF#createBlankNode(String)} method from the parsed blank node
106     * labels.
107     *
108     * @see #target(Graph)
109     * @param rdfTermFactory
110     *            {@link RDF} to use for generating RDFTerms.
111     * @return An {@link RDFParser} that will use the specified rdfTermFactory
112     */
113    RDFParser rdfTermFactory(RDF rdfTermFactory);
114
115    /**
116     * Specify the content type of the RDF syntax to parse.
117     * <p>
118     * This option can be used to select the RDFSyntax of the source, overriding
119     * any <code>Content-Type</code> headers or equivalent.
120     * <p>
121     * The character set of the RDFSyntax is assumed to be
122     * {@link StandardCharsets#UTF_8} unless overridden within the document
123     * (e.g. {@code <?xml version="1.0" encoding="iso-8859-1"?>} in
124     * {@link RDFSyntax#RDFXML}).
125     * <p>
126     * This method will override any contentType set with
127     * {@link #contentType(String)}.
128     *
129     * @see #contentType(String)
130     * @param rdfSyntax
131     *            An {@link RDFSyntax} to parse the source according to, e.g.
132     *            {@link RDFSyntax#TURTLE}.
133     * @throws IllegalArgumentException
134     *             If this RDFParser does not support the specified RDFSyntax.
135     * @return An {@link RDFParser} that will use the specified content type.
136     */
137    RDFParser contentType(RDFSyntax rdfSyntax) throws IllegalArgumentException;
138
139    /**
140     * Specify the content type of the RDF syntax to parse.
141     * <p>
142     * This option can be used to select the RDFSyntax of the source, overriding
143     * any <code>Content-Type</code> headers or equivalent.
144     * <p>
145     * The content type MAY include a <code>charset</code> parameter if the RDF
146     * media types permit it; the default charset is
147     * {@link StandardCharsets#UTF_8} unless overridden within the document.
148     * <p>
149     * This method will override any contentType set with
150     * {@link #contentType(RDFSyntax)}.
151     *
152     * @see #contentType(RDFSyntax)
153     * @param contentType
154     *            A content-type string, e.g. <code>application/ld+json</code>
155     *            or <code>text/turtle;charset="UTF-8"</code> as specified by
156     *            <a href="https://tools.ietf.org/html/rfc7231#section-3.1.1.1">
157     *            RFC7231</a>.
158     * @return An {@link RDFParser} that will use the specified content type.
159     * @throws IllegalArgumentException
160     *             If the contentType has an invalid syntax, or this RDFParser
161     *             does not support the specified contentType.
162     */
163    RDFParser contentType(String contentType) throws IllegalArgumentException;
164
165    /**
166     * Specify a {@link Graph} to add parsed triples to.
167     * <p>
168     * If the source supports datasets (e.g. the {@link #contentType(RDFSyntax)}
169     * set has {@link RDFSyntax#supportsDataset} is true)), then only quads in
170     * the <em>default graph</em> will be added to the Graph as {@link Triple}s.
171     * <p>
172     * It is undefined if any triples are added to the specified {@link Graph}
173     * if {@link #parse()} throws any exceptions. (However implementations are
174     * free to prevent this using transaction mechanisms or similar). If
175     * {@link Future#get()} does not indicate an exception, the parser
176     * implementation SHOULD have inserted all parsed triples to the specified
177     * graph.
178     * <p>
179     * Calling this method will override any earlier targets set with
180     * {@link #target(Graph)}, {@link #target(Consumer)} or
181     * {@link #target(Dataset)}.
182     * <p>
183     * The default implementation of this method calls {@link #target(Consumer)}
184     * with a {@link Consumer} that does {@link Graph#add(Triple)} with
185     * {@link Quad#asTriple()} if the quad is in the default graph.
186     *
187     * @param graph
188     *            The {@link Graph} to add triples to.
189     * @return An {@link RDFParser} that will insert triples into the specified
190     *         graph.
191     */
192    default RDFParser target(final Graph graph) {
193        return target(q -> {
194            if (!q.getGraphName().isPresent()) {
195                graph.add(q.asTriple());
196            }
197        });
198    }
199
200    /**
201     * Specify a {@link Dataset} to add parsed quads to.
202     * <p>
203     * It is undefined if any quads are added to the specified {@link Dataset}
204     * if {@link #parse()} throws any exceptions. (However implementations are
205     * free to prevent this using transaction mechanisms or similar). On the
206     * other hand, if {@link #parse()} does not indicate an exception, the
207     * implementation SHOULD have inserted all parsed quads to the specified
208     * dataset.
209     * <p>
210     * Calling this method will override any earlier targets set with
211     * {@link #target(Graph)}, {@link #target(Consumer)} or
212     * {@link #target(Dataset)}.
213     * <p>
214     * The default implementation of this method calls {@link #target(Consumer)}
215     * with a {@link Consumer} that does {@link Dataset#add(Quad)}.
216     *
217     * @param dataset
218     *            The {@link Dataset} to add quads to.
219     * @return An {@link RDFParser} that will insert triples into the specified
220     *         dataset.
221     */
222    default RDFParser target(final Dataset dataset) {
223        return target(dataset::add);
224    }
225
226    /**
227     * Specify a consumer for parsed quads.
228     * <p>
229     * The quads will include triples in all named graphs of the parsed source,
230     * including any triples in the default graph. When parsing a source format
231     * which do not support datasets, all quads delivered to the consumer will
232     * be in the default graph (e.g. their {@link Quad#getGraphName()} will be
233     * as {@link Optional#empty()}), while for a source
234     * <p>
235     * It is undefined if any quads are consumed if {@link #parse()} throws any
236     * exceptions. On the other hand, if {@link #parse()} does not indicate an
237     * exception, the implementation SHOULD have produced all parsed quads to
238     * the specified consumer.
239     * <p>
240     * Calling this method will override any earlier targets set with
241     * {@link #target(Graph)}, {@link #target(Consumer)} or
242     * {@link #target(Dataset)}.
243     * <p>
244     * The consumer is not assumed to be thread safe - only one
245     * {@link Consumer#accept(Object)} is delivered at a time for a given
246     * {@link RDFParser#parse()} call.
247     * <p>
248     * This method is typically called with a functional consumer, for example:
249     *
250     * <pre>
251     * {@code
252     * List<Quad> quads = new ArrayList<Quad>;
253     * parserBuilder.target(quads::add).parse();
254     * }
255     * </pre>
256     *
257     * @param consumer
258     *            A {@link Consumer} of {@link Quad}s
259     * @return An {@link RDFParser} that will call the consumer for into the
260     *         specified dataset.
261     */
262    RDFParser target(Consumer<Quad> consumer);
263
264    /**
265     * Specify a base IRI to use for parsing any relative IRI references.
266     * <p>
267     * Setting this option will override any protocol-specific base IRI (e.g.
268     * <code>Content-Location</code> header) or the {@link #source(IRI)} IRI,
269     * but does not override any base IRIs set within the source document (e.g.
270     * <code>@base</code> in Turtle documents).
271     * <p>
272     * If the source is in a syntax that does not support relative IRI
273     * references (e.g. {@link RDFSyntax#NTRIPLES}), setting the
274     * <code>base</code> has no effect.
275     * <p>
276     * This method will override any base IRI set with {@link #base(String)}.
277     *
278     * @see #base(String)
279     * @param base
280     *            An absolute IRI to use as a base.
281     * @return An {@link RDFParser} that will use the specified base IRI.
282     */
283    RDFParser base(IRI base);
284
285    /**
286     * Specify a base IRI to use for parsing any relative IRI references.
287     * <p>
288     * Setting this option will override any protocol-specific base IRI (e.g.
289     * <code>Content-Location</code> header) or the {@link #source(IRI)} IRI,
290     * but does not override any base IRIs set within the source document (e.g.
291     * <code>@base</code> in Turtle documents).
292     * <p>
293     * If the source is in a syntax that does not support relative IRI
294     * references (e.g. {@link RDFSyntax#NTRIPLES}), setting the
295     * <code>base</code> has no effect.
296     * <p>
297     * This method will override any base IRI set with {@link #base(IRI)}.
298     *
299     * @see #base(IRI)
300     * @param base
301     *            An absolute IRI to use as a base.
302     * @return An {@link RDFParser} that will use the specified base IRI.
303     * @throws IllegalArgumentException
304     *             If the base is not a valid absolute IRI string
305     */
306    RDFParser base(String base) throws IllegalArgumentException;
307
308    /**
309     * Specify a source {@link InputStream} to parse.
310     * <p>
311     * The source set will not be read before the call to {@link #parse()}.
312     * <p>
313     * The InputStream will not be closed after parsing. The InputStream does
314     * not need to support {@link InputStream#markSupported()}.
315     * <p>
316     * The parser might not consume the complete stream (e.g. an RDF/XML parser
317     * may not read beyond the closing tag of
318     * <code>&lt;/rdf:Description&gt;</code>).
319     * <p>
320     * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)}
321     * SHOULD be set before calling {@link #parse()}.
322     * <p>
323     * The character set is assumed to be {@link StandardCharsets#UTF_8} unless
324     * the {@link #contentType(String)} specifies otherwise or the document
325     * declares its own charset (e.g. RDF/XML with a
326     * <code>&lt;?xml encoding="iso-8859-1"&gt;</code> header).
327     * <p>
328     * The {@link #base(IRI)} or {@link #base(String)} MUST be set before
329     * calling {@link #parse()}, unless the RDF syntax does not permit relative
330     * IRIs (e.g. {@link RDFSyntax#NTRIPLES}).
331     * <p>
332     * This method will override any source set with {@link #source(IRI)},
333     * {@link #source(Path)} or {@link #source(String)}.
334     *
335     * @param inputStream
336     *            An InputStream to consume
337     * @return An {@link RDFParser} that will use the specified source.
338     */
339    RDFParser source(InputStream inputStream);
340
341    /**
342     * Specify a source file {@link Path} to parse.
343     * <p>
344     * The source set will not be read before the call to {@link #parse()}.
345     * <p>
346     * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)}
347     * SHOULD be set before calling {@link #parse()}.
348     * <p>
349     * The character set is assumed to be {@link StandardCharsets#UTF_8} unless
350     * the {@link #contentType(String)} specifies otherwise or the document
351     * declares its own charset (e.g. RDF/XML with a
352     * <code>&lt;?xml encoding="iso-8859-1"&gt;</code> header).
353     * <p>
354     * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
355     * {@link #parse()}, otherwise {@link Path#toUri()} will be used as the base
356     * IRI.
357     * <p>
358     * This method will override any source set with {@link #source(IRI)},
359     * {@link #source(InputStream)} or {@link #source(String)}.
360     *
361     * @param file
362     *            A Path for a file to parse
363     * @return An {@link RDFParser} that will use the specified source.
364     */
365    RDFParser source(Path file);
366
367    /**
368     * Specify an absolute source {@link IRI} to retrieve and parse.
369     * <p>
370     * The source set will not be read before the call to {@link #parse()}.
371     * <p>
372     * If this builder does not support the given IRI protocol (e.g.
373     * <code>urn:uuid:ce667463-c5ab-4c23-9b64-701d055c4890</code>), this method
374     * should succeed, while the {@link #parse()} should throw an
375     * {@link IOException}.
376     * <p>
377     * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} MAY
378     * be set before calling {@link #parse()}, in which case that type MAY be
379     * used for content negotiation (e.g. <code>Accept</code> header in HTTP),
380     * and SHOULD be used for selecting the RDFSyntax.
381     * <p>
382     * The character set is assumed to be {@link StandardCharsets#UTF_8} unless
383     * the protocol's equivalent of <code>Content-Type</code> specifies
384     * otherwise or the document declares its own charset (e.g. RDF/XML with a
385     * <code>&lt;?xml encoding="iso-8859-1"&gt;</code> header).
386     * <p>
387     * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
388     * {@link #parse()}, otherwise the source IRI will be used as the base IRI.
389     * <p>
390     * This method will override any source set with {@link #source(Path)},
391     * {@link #source(InputStream)} or {@link #source(String)}.
392     *
393     * @param iri
394     *            An IRI to retrieve and parse
395     * @return An {@link RDFParser} that will use the specified source.
396     */
397    RDFParser source(IRI iri);
398
399    /**
400     * Specify an absolute source IRI to retrieve and parse.
401     * <p>
402     * The source set will not be read before the call to {@link #parse()}.
403     * <p>
404     * If this builder does not support the given IRI (e.g.
405     * <code>urn:uuid:ce667463-c5ab-4c23-9b64-701d055c4890</code>), this method
406     * should succeed, while the {@link #parse()} should throw an
407     * {@link IOException}.
408     * <p>
409     * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} MAY
410     * be set before calling {@link #parse()}, in which case that type MAY be
411     * used for content negotiation (e.g. <code>Accept</code> header in HTTP),
412     * and SHOULD be used for selecting the RDFSyntax.
413     * <p>
414     * The character set is assumed to be {@link StandardCharsets#UTF_8} unless
415     * the protocol's equivalent of <code>Content-Type</code> specifies
416     * otherwise or the document declares its own charset (e.g. RDF/XML with a
417     * <code>&lt;?xml encoding="iso-8859-1"&gt;</code> header).
418     * <p>
419     * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
420     * {@link #parse()}, otherwise the source IRI will be used as the base IRI.
421     * <p>
422     * This method will override any source set with {@link #source(Path)},
423     * {@link #source(InputStream)} or {@link #source(IRI)}.
424     *
425     * @param iri
426     *            An IRI to retrieve and parse
427     * @return An {@link RDFParser} that will use the specified source.
428     * @throws IllegalArgumentException
429     *             If the base is not a valid absolute IRI string
430     *
431     */
432    RDFParser source(String iri) throws IllegalArgumentException;
433
434    /**
435     * Parse the specified source.
436     * <p>
437     * A source method (e.g. {@link #source(InputStream)}, {@link #source(IRI)},
438     * {@link #source(Path)}, {@link #source(String)} or an equivalent subclass
439     * method) MUST have been called before calling this method, otherwise an
440     * {@link IllegalStateException} will be thrown.
441     * <p>
442     * A target method (e.g. {@link #target(Consumer)},
443     * {@link #target(Dataset)}, {@link #target(Graph)} or an equivalent
444     * subclass method) MUST have been called before calling parse(), otherwise
445     * an {@link IllegalStateException} will be thrown.
446     * <p>
447     * It is undefined if this method is thread-safe, however the
448     * {@link RDFParser} may be reused (e.g. setting a different source) as soon
449     * as the {@link Future} has been returned from this method.
450     * <p>
451     * The RDFParser SHOULD perform the parsing as an asynchronous operation,
452     * and return the {@link Future} as soon as preliminary checks (such as
453     * validity of the {@link #source(IRI)} and {@link #contentType(RDFSyntax)}
454     * settings) have finished. The future SHOULD not mark
455     * {@link Future#isDone()} before parsing is complete. A synchronous
456     * implementation MAY be blocking on the <code>parse()</code> call and
457     * return a Future that is already {@link Future#isDone()}.
458     * <p>
459     * The returned {@link Future} contains a {@link ParseResult}.
460     * Implementations may subclass this interface to provide any parser
461     * details, e.g. list of warnings. <code>null</code> is a possible return
462     * value if no details are available, but parsing succeeded.
463     * <p>
464     * If an exception occurs during parsing, (e.g. {@link IOException} or
465     * <code>org.apache.commons.rdf.simple.experimental.RDFParseException</code>),
466     * it should be indicated as the
467     * {@link java.util.concurrent.ExecutionException#getCause()} in the
468     * {@link java.util.concurrent.ExecutionException} thrown on
469     * {@link Future#get()}.
470     *
471     * @return A Future that will return the populated {@link Graph} when the
472     *         parsing has finished.
473     * @throws IOException
474     *             If an error occurred while starting to read the source (e.g.
475     *             file not found, unsupported IRI protocol). Note that IO
476     *             errors during parsing would instead be the
477     *             {@link java.util.concurrent.ExecutionException#getCause()} of
478     *             the {@link java.util.concurrent.ExecutionException} thrown on
479     *             {@link Future#get()}.
480     * @throws IllegalStateException
481     *             If the builder is in an invalid state, e.g. a
482     *             <code>source</code> has not been set.
483     */
484    Future<? extends ParseResult> parse() throws IOException, IllegalStateException;
485}