webmagic
webmagic copied to clipboard
建议:给request指定下载成功后用什么方法执行
我在当前版本的基础上提一个建议:在 Request 上添加一个字段,用于指定该 request 的 HTML 下载完成后用哪个方法进行解析。实现的 Request: ` import us.codecraft.webmagic.Request;
/**
 * A {@link Request} that also carries the name of the handler method which
 * should parse the page once this request has been downloaded.
 *
 * <p>{@code AnnoPageProcessor} reads this name and dispatches the page to the
 * method registered under it; a {@code null} or blank name means "use the
 * default handler".
 */
public class MyRequest extends Request {

    /**
     * Name of the handler method for this request, or {@code null} if none
     * was given. Kept public because existing code reads the field directly.
     */
    public String tartMethod;

    /**
     * Creates a request without a handler name.
     *
     * @param url the URL to download
     */
    public MyRequest(String url) {
        super(url);
    }

    /**
     * Creates a request bound to a named handler method.
     *
     * @param url        the URL to download
     * @param tartMethod name of the handler method that should parse the page
     */
    public MyRequest(String url, String tartMethod) {
        super(url);
        this.tartMethod = tartMethod;
    }

    /**
     * @return the handler method name, possibly {@code null}
     */
    public String getTartMethod() {
        return this.tartMethod;
    }

    /**
     * @param tartMethod the handler method name to associate with this request
     */
    public void setTartMethod(String tartMethod) {
        this.tartMethod = tartMethod;
    }
}
新增加的注解:
` import java.lang.annotation.*;
/**
 * Marks a method as a named page handler.
 *
 * <p>The annotation is retained at runtime so that it can be discovered via
 * reflection, and it may only be placed on methods.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.METHOD)
public @interface RequestProcessor {

    /**
     * The name under which the annotated method is registered.
     *
     * @return the handler name; the empty string when not specified
     */
    String value() default "";
}
实现PageProcessor接口:
/**
 * A {@link PageProcessor} that routes each downloaded page to a handler method
 * chosen by the request that produced it.
 *
 * <p>At construction time every method declared directly on the concrete
 * subclass and annotated with {@link RequestProcessor} is registered under the
 * trimmed annotation value. When a page arrives whose request is a
 * {@link MyRequest} with a non-blank target method name, the method registered
 * under that name is invoked with the page; every other page goes to
 * {@link #handler(Page)}.
 *
 * <p>Note: only {@code getClass().getDeclaredMethods()} is scanned, so
 * annotated methods inherited from intermediate superclasses are not
 * registered.
 */
@Slf4j
public abstract class AnnoPageProcessor implements PageProcessor {

    /** Handler methods keyed by the trimmed {@link RequestProcessor#value()}. */
    private final Map<String, Method> handlerMethods = new HashMap<>();

    /** Spider to stop when a request names an unknown handler; may be unset. */
    private Spider spider;

    /**
     * @return the spider this processor may stop, or {@code null} if none was set
     */
    public Spider getSpider() {
        return spider;
    }

    /**
     * @param spider the spider to stop when a request names an unknown handler
     */
    public void setSpider(Spider spider) {
        this.spider = spider;
    }

    /**
     * Default handler, used for plain requests and for {@link MyRequest}s that
     * carry no target method name.
     *
     * @param page the downloaded page
     */
    public abstract void handler(Page page);

    /**
     * Registers the annotated handler methods of the concrete subclass.
     *
     * <p>NOTE(review): this calls the overridable {@link #initTargetMethods()}
     * from the constructor, so an override runs before the subclass's own
     * fields are initialised.
     *
     * @throws Exception if two methods are registered under the same name
     */
    public AnnoPageProcessor() throws Exception {
        initTargetMethods();
    }

    /**
     * Scans the methods declared on the runtime class and registers each one
     * annotated with {@link RequestProcessor} under its trimmed name.
     *
     * @throws Exception if the same handler name is declared more than once
     *                   (an {@link IllegalStateException} is thrown)
     */
    protected void initTargetMethods() throws Exception {
        for (Method method : getClass().getDeclaredMethods()) {
            RequestProcessor annotation = method.getAnnotation(RequestProcessor.class);
            if (annotation == null) {
                continue;
            }
            String name = annotation.value().trim();
            if (handlerMethods.containsKey(name)) {
                throw new IllegalStateException("Multiple handlerMethod:" + name);
            }
            // Only handler methods need to be made accessible, not every method.
            method.setAccessible(true);
            handlerMethods.put(name, method);
        }
    }

    /**
     * Dispatches the page to the handler named by its request, falling back to
     * {@link #handler(Page)} when the request is not a {@link MyRequest} or
     * names no handler.
     *
     * <p>If the request names a handler that was never registered, an error is
     * logged, the spider (when one has been set) is stopped, and the page is
     * not processed. Exceptions raised by a handler are logged and not
     * propagated.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (!(page.getRequest() instanceof MyRequest)) { // plain request
            handler(page);
            return;
        }

        MyRequest request = (MyRequest) page.getRequest();
        String target = request.getTartMethod();
        if (target == null || target.trim().isEmpty()) {
            handler(page);
            return;
        }

        // Registry keys are trimmed, so the lookup key must be trimmed as well.
        target = target.trim();
        Method method = handlerMethods.get(target);
        if (method == null) {
            log.error("shut down spidier,beacuse no such targetMethod: " + target);
            if (spider != null) {
                spider.stop();
            }
            return;
        }

        try {
            method.invoke(this, page);
        } catch (Exception e) {
            // Report the handler's own exception rather than the reflection wrapper.
            Throwable cause = e;
            if (e instanceof java.lang.reflect.InvocationTargetException && e.getCause() != null) {
                cause = e.getCause();
            }
            log.error("handler method failed: " + target, cause);
        }
    }
} `
爬虫案例: ` @Component @Slf4j public class BtJiaProcessor extends AnnoPageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(500);
public BtJiaProcessor() throws Exception {
super();
}
/**
 * Default handler: takes the first link found on the page and queues it as a
 * follow-up request to be parsed by the handler named {@code "nextHandler"}.
 *
 * <p>Nothing is queued when the page yields no link.
 *
 * @param page the downloaded page
 */
@Override
public void handler(Page page) {
    String firstLink = page.getHtml().links().get();
    if (firstLink == null || firstLink.trim().isEmpty()) {
        // No link on this page: do not enqueue a request without a URL.
        return;
    }
    page.addTargetRequest(new MyRequest(firstLink, "nextHandler"));
}
/**
 * Handler registered under the name {@code "nextHandler"}; in this example it
 * only writes an info-level log line.
 *
 * @param page the downloaded page (not used by this example)
 */
@RequestProcessor("nextHandler")
public void nextHandler(Page page){
log.info("now begin do it");
}
/**
 * @return the site configuration held by this processor
 */
@Override
public Site getSite() {
    return this.site;
}
`
因为 Spider 类的 onDownloadSuccess() 方法是私有的,无法重写,因此只能通过实现 PageProcessor 接口来实现路由到对应处理方法的逻辑。