1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.any23.extractor; 19 20 import org.eclipse.rdf4j.model.IRI; 21 import org.w3c.dom.Document; 22 23 import java.io.IOException; 24 import java.io.InputStream; 25 26 /** 27 * It defines the signature of a generic Extractor. 28 * 29 * @param <Input> 30 * the type of the input data to be processed. 31 */ 32 public interface Extractor<Input> { 33 34 /** 35 * This interface specializes an {@link Extractor} able to handle {@link java.net.URI} as input format. Use it if 36 * you need to fetch a document before the extraction 37 */ 38 public interface BlindExtractor extends Extractor<IRI> { 39 } 40 41 /** 42 * This interface specializes an {@link Extractor} able to handle {@link java.io.InputStream} as input format. 43 */ 44 public interface ContentExtractor extends Extractor<InputStream> { 45 46 /** 47 * If <code>true</code>, the extractor will stop at first parsing error, if<code>false</code> the extractor will 48 * attempt to ignore all parsing errors. 49 * 50 * @param f 51 * tolerance flag. 52 */ 53 void setStopAtFirstError(boolean f); 54 55 } 56 57 /** 58 * This interface specializes an {@link Extractor} able to handle {@link org.w3c.dom.Document} as input format. 59 */ 60 public interface TagSoupDOMExtractor extends Extractor<Document> { 61 } 62 63 /** 64 * Executes the extractor. Will be invoked only once, extractors are not reusable. 65 * 66 * @param extractionParameters 67 * the parameters to be applied during the extraction. 68 * @param context 69 * The document context. 70 * @param in 71 * The extractor input data. 72 * @param out 73 * the collector for the extracted data. 74 * 75 * @throws IOException 76 * On error while reading from the input stream. 77 * @throws ExtractionException 78 * On other error, such as parse errors. 79 */ 80 void run(ExtractionParameters extractionParameters, ExtractionContext context, Input in, ExtractionResult out) 81 throws IOException, ExtractionException; 82 83 /** 84 * Returns a {@link ExtractorDescription} of this extractor. 85 * 86 * @return the object representing the extractor description. 87 */ 88 ExtractorDescription getDescription(); 89 90 }