An example processor
package org.archive.crawler.extractor;
import java.util.regex.Matcher;
import javax.management.AttributeNotFoundException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.extractor.Link;
import org.archive.util.TextUtils;
/**
 * A very simple extractor. Assumes that any string matching a
 * configurable regular expression is a link.
 *
 * @author Kristinn Sigurdsson
 */
public class SimpleExtractor extends Processor
        implements CoreAttributeConstants {

    public static final String ATTR_REGULAR_EXPRESSION = "input-param";
    public static final String DEFAULT_REGULAR_EXPRESSION =
        "http://([a-zA-Z0-9]+\\.)+[a-zA-Z0-9]+/"; // find domains

    int numberOfCURIsHandled = 0;
    int numberOfLinksExtracted = 0;

    public SimpleExtractor(String name) {
        super(name, "A very simple link extractor. Doesn't do anything useful.");

        // Register the regular expression as a configurable (expert) setting.
        Type e;
        e = addElementToDefinition(new SimpleType(ATTR_REGULAR_EXPRESSION,
            "Any string matching this regular expression is treated as a link",
            DEFAULT_REGULAR_EXPRESSION));
        e.setExpertSetting(true);
    }
    protected void innerProcess(CrawlURI curi) {
        if (!curi.isHttpTransaction()) {
            // We only handle HTTP at the moment.
            return;
        }
        numberOfCURIsHandled++;

        // Get the downloaded document as a character sequence.
        CharSequence cs = curi.getHttpRecorder().getReplayCharSequence();

        // Look up the configured regular expression, falling back to the default.
        String regexpr = null;
        try {
            regexpr = (String)getAttribute(ATTR_REGULAR_EXPRESSION, curi);
        } catch (AttributeNotFoundException e) {
            regexpr = DEFAULT_REGULAR_EXPRESSION;
        }

        // Add every match to the CrawlURI as a speculative outlink.
        Matcher match = TextUtils.getMatcher(regexpr, cs);
        while (match.find()) {
            String link = cs.subSequence(match.start(), match.end()).toString();
            curi.createAndAddLink(link, Link.SPECULATIVE_MISC, Link.NAVLINK_HOP);
            numberOfLinksExtracted++;
            System.out.println("SimpleExtractor: " + link);
        }
        TextUtils.recycleMatcher(match);
    }
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor." +
            "SimpleExtractor\n");
        ret.append(" Function: Example extractor\n");
        ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}
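
The default expression only matches absolute http:// URLs up to and including the slash after the host name. As a quick illustration of what such a pattern picks out, the standalone snippet below runs the same default expression over a made-up sample string using plain java.util.regex; the RegexCheck class name and the sample URLs are purely illustrative and are not part of Heritrix.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Illustrative only: shows what SimpleExtractor's default expression matches.
public class RegexCheck {
    public static void main(String[] args) {
        String regexpr = "http://([a-zA-Z0-9]+\\.)+[a-zA-Z0-9]+/"; // same as DEFAULT_REGULAR_EXPRESSION
        CharSequence cs = "See http://www.archive.org/index.html and http://crawler.archive.org/.";
        Matcher match = Pattern.compile(regexpr).matcher(cs);
        while (match.find()) {
            // Prints "http://www.archive.org/" and "http://crawler.archive.org/"
            System.out.println(cs.subSequence(match.start(), match.end()));
        }
    }
}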