An introduction to the indexing method of Nutch integrated with Solr.
/**
* build index
* @param solrUrl solr's web address
* @param crawlDb Crawling DB storage path: \crawl\crawldb
* @param linkDb Crawling link storage path: \crawl\linkdb
* @param segments metadata storage path: \crawl\segments
* @param noCommit whether to submit the slor server and the slor index
* @param deleteGone whether to delete outdated documents
* @param solrParams parameters of solr
* @param filter whether to enable URL filtering
* @param normalize whether to format the URL
* @throws IOException
*/
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,
List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams,
boolean filter, boolean normalize) throws IOException {
...
IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
...
}
Nutch performs indexing through a single MapReduce job.
The map input is the SequenceFile output of the Nutch crawl directories: the key is the URL that Nutch crawled, and the value uses a Java generic wrapper that abstracts all of Nutch's custom data types into one NutchWritable object.
The data types that NutchWritable can contain are as follows:
CLASSES = new Class[] {
org.apache.hadoop.io.NullWritable.class,
org.apache.hadoop.io.BooleanWritable.class,
org.apache.hadoop.io.LongWritable.class,
org.apache.hadoop.io.BytesWritable.class,
org.apache.hadoop.io.FloatWritable.class,
org.apache.hadoop.io.IntWritable.class,
org.apache.hadoop.io.MapWritable.class,
org.apache.hadoop.io.Text.class,
org.apache.hadoop.io.MD5Hash.class,
org.apache.nutch.crawl.CrawlDatum.class,
org.apache.nutch.crawl.Inlink.class,
org.apache.nutch.crawl.Inlinks.class,
org.apache.nutch.fetcher.FetcherOutput.class,
org.apache.nutch.metadata.Metadata.class,
org.apache.nutch.parse.Outlink.class,
org.apache.nutch.parse.ParseText.class,
org.apache.nutch.parse.ParseData.class,
org.apache.nutch.parse.ParseImpl.class,
org.apache.nutch.parse.ParseStatus.class,
org.apache.nutch.protocol.Content.class,
org.apache.nutch.protocol.ProtocolStatus.class,
org.apache.nutch.scoring.webgraph.LinkDatum.class,
};
These data types abstract the data types of Nutch at various stages of crawling.
The map stage does not process the value, but only the URL. The processing code is as follows:
String urlString = filterUrl(normalizeUrl(key.toString()));
This call filters and normalizes the URL according to the configured rules; whether this step runs at all is controlled by the parameters passed on the command line.
The reduce phase processes all of the crawled data; the annotated code is as follows:
/**
* Output format: url as key, indexed action as value
*/
public void reduce(Text key, Iterator<NutchWritable> values,
OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
throws IOException {
Inlinks inlinks = null;
CrawlDatum dbDatum = null;
CrawlDatum fetchDatum = null;
ParseData parseData = null;
ParseText parseText = null;
while (values.hasNext()) {
final Writable value = values.next().get(); // unwrap
//If it is the data type of URL injection
if (value instanceof Inlinks) {
inlinks = (Inlinks)value;
//If it is a crawled data type
} else if (value instanceof CrawlDatum) {
final CrawlDatum datum = (CrawlDatum)value;
//If the current data is in db injection state
if (CrawlDatum.hasDbStatus(datum)) {
dbDate = date;
}
//If the current data is in the crawling completion state.
else if (CrawlDatum.hasFetchStatus(datum)) {
// don't index unmodified (empty) pages
//Determine whether the crawling has been modified
if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
fetchDatum = datum;
/**
* Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT.
*/
//If the parameter is set to delete to true, delete the wrong and outdated pages
if (delete) {
//If the crawled page expires, take the delete operation.
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
//If the crawled page has been redirected to another page, delete the operation.
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {
reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
}
}
// URLs are discovered through other URLs || The page's signature || The page's metadata is generated by the parser
} else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
continue;
} else {
throw new RuntimeException("Unexpected status: "+datum.getStatus());
}
//if it is the parsed data type
} else if (value instanceof ParseData) {
parseData = (ParseData)value;
// Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
if (deleteRobotsNoIndex) {
// Get the robots meta data
String robotsMeta = parseData.getMeta("robots");
// Has it a noindex for this url?
if (robotsMeta != null && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
// Delete it!
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
}
//The parsed Text file
} else if (value instanceof ParseText) {
parseText = (ParseText)value;
} else if (LOG.isWarnEnabled()) {
LOG.warn("Unrecognized type: "+value.getClass());
}
}
//If there is only a link, there is no record of crawling history or the crawling data is returned directly
if (fetchDatum == null || dbDatum == null
|| parseText == null || parseData == null) {
return; // only have inlinks
}
// Whether to skip DB_NOTMODIFIED pages
//If the page has been crawled but not repaired, skip it if skip is set in the incoming command.
if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
reporter.incrCounter("IndexerStatus", "Skipped", 1);
return;
}
//The page crawling is successful, but the parsing fails, return directly
if (!parseData.getStatus().isSuccess() ||
fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
return;
}
NutchDocument doc = new NutchDocument();
//Get the metadata of the page from the parsed data
final Metadata metadata = parseData.getContentMeta();
// add segment, used to map from merged index back to segment files
doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
// page summary
// add digest, used by dedup
doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
final Parse parse = new ParseImpl(parseText, parseData);
try {
// extract information from dbDatum and pass it to
// fetchDatum so that indexing filters can use it
final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
if (url != null) {
fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
}
// run indexing filters
// execute all filters
doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
} catch (final IndexingException e) {
if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
reporter.incrCounter("IndexerStatus", "Errors", 1);
return;
}
// skip documents discarded by indexing filters
if (doc == null) {
reporter.incrCounter("IndexerStatus", "Skipped by filters", 1);
return;
}
float boost = 1.0f;
// run scoring filters
//Execute the scoring filter
try {
boost = this.scfilters.indexerScore(key, doc, dbDatum,
fetchDatum, parse, inlinks, boost);
} catch (final ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Error calculating score " + key + ": " + e);
}
return;
}
// Use the score as the weight of the document
// apply boost to all indexed fields.
doc.setWeight(boost);
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
reporter.incrCounter("IndexerStatus", "Documents added", 1);
NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
output.collect(key, action);
}
My study currently ends here; the rest is to be continued.