public class GeneratorJob extends NutchTool implements Tool
Modifier and Type | Class and Description |
---|---|
static class |
GeneratorJob.SelectorEntry |
static class |
GeneratorJob.SelectorEntryComparator |
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
BATCH_ID |
static java.lang.String |
GENERATE_COUNT |
static java.lang.String |
GENERATE_UPDATE_CRAWLDB |
static java.lang.String |
GENERATOR_COUNT_MODE |
static java.lang.String |
GENERATOR_COUNT_VALUE_DOMAIN |
static java.lang.String |
GENERATOR_COUNT_VALUE_HOST |
static java.lang.String |
GENERATOR_COUNT_VALUE_IP |
static java.lang.String |
GENERATOR_CUR_TIME |
static java.lang.String |
GENERATOR_DELAY |
static java.lang.String |
GENERATOR_FILTER |
static java.lang.String |
GENERATOR_MAX_COUNT |
static java.lang.String |
GENERATOR_MIN_SCORE |
static java.lang.String |
GENERATOR_NORMALISE |
static java.lang.String |
GENERATOR_RANDOM_SEED |
static java.lang.String |
GENERATOR_SITEMAP |
static java.lang.String |
GENERATOR_TOP_N |
protected static org.slf4j.Logger |
LOG |
currentJob, currentJobNum, numJobs, results, status
Constructor and Description |
---|
GeneratorJob() |
GeneratorJob(Configuration conf) |
Modifier and Type | Method and Description |
---|---|
java.lang.String |
generate(long topN,
long curTime,
boolean filter,
boolean norm,
boolean sitemap)
Mark URLs ready for fetching.
|
java.util.Collection<WebPage.Field> |
getFields(Job job) |
static void |
main(java.lang.String[] args) |
static java.lang.String |
randomBatchId()
Generates a random batch id
|
java.util.Map<java.lang.String,java.lang.Object> |
run(java.util.Map<java.lang.String,java.lang.Object> args)
Runs generator
|
int |
run(java.lang.String[] args)
Runs the generator from the command line
|
getProgress, getStatus, killJob, stopJob
getConf, setConf
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
getConf, setConf
public static final java.lang.String GENERATE_UPDATE_CRAWLDB
public static final java.lang.String GENERATOR_MIN_SCORE
public static final java.lang.String GENERATOR_FILTER
public static final java.lang.String GENERATOR_NORMALISE
public static final java.lang.String GENERATOR_SITEMAP
public static final java.lang.String GENERATOR_MAX_COUNT
public static final java.lang.String GENERATOR_COUNT_MODE
public static final java.lang.String GENERATOR_COUNT_VALUE_DOMAIN
public static final java.lang.String GENERATOR_COUNT_VALUE_HOST
public static final java.lang.String GENERATOR_COUNT_VALUE_IP
public static final java.lang.String GENERATOR_TOP_N
public static final java.lang.String GENERATOR_CUR_TIME
public static final java.lang.String GENERATOR_DELAY
public static final java.lang.String GENERATOR_RANDOM_SEED
public static final java.lang.String BATCH_ID
public static final java.lang.String GENERATE_COUNT
protected static final org.slf4j.Logger LOG
public GeneratorJob()
public GeneratorJob(Configuration conf)
public java.util.Collection<WebPage.Field> getFields(Job job)
public static java.lang.String randomBatchId()
public java.util.Map<java.lang.String,java.lang.Object> run(java.util.Map<java.lang.String,java.lang.Object> args) throws java.lang.Exception
public java.lang.String generate(long topN, long curTime, boolean filter, boolean norm, boolean sitemap) throws java.lang.Exception
topN - top threshold for maximum number of URLs permitted in a batch
curTime - the current time in milliseconds
filter - optional filtering of URLs within the generated batch
norm - optional normalization of URLs within the generated batch
sitemap - flag indicating whether a URL is a sitemap and hence processed accordingly
java.lang.Exception
public int run(java.lang.String[] args) throws java.lang.Exception
public static void main(java.lang.String[] args) throws java.lang.Exception
java.lang.Exception
Copyright © 2019 The Apache Software Foundation