webmagic
webmagic copied to clipboard
我按照官方文档的代理方式,配置代理后。以多个线程启动爬虫,总是会出现ssl异常javax.net.ssl.SSLException: Received fatal alert: internal_error。单个线程不会
解决了吗?
没有
在 2021-09-17 14:17:18,"AzQiang97" @.***> 写道:
解决了吗?
— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub, or unsubscribe. Triage notifications on the go with GitHub Mobile for iOS or Android.
æ·»å ä¸é¢ä¸¤ä¸ªjavaæ件人åæç¨ä¾çmainæ¹æ³æ¢æ第ä¸ä¸ªæä»¶å°±å¥½äº `package com.ygq.demo01.utils;
import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map;
import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.downloader.HttpClientRequestContext; import us.codecraft.webmagic.downloader.HttpUriRequestConverter; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils;
/**
-
The http downloader based on HttpClient.
-
@author [email protected]
-
@since 0.1.0 */ public class HttpClientDownloader extends AbstractDownloader {
private Logger logger = LoggerFactory.getLogger(getClass());
private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
private ProxyProvider proxyProvider;
private boolean responseHeader = true;
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; }
public void setProxyProvider(ProxyProvider proxyProvider) { this.proxyProvider = proxyProvider; }
private CloseableHttpClient getHttpClient(Site site) { if (site == null) { return httpClientGenerator.getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); } } } return httpClient; }
public Page download(Request request, Task task) { if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = Page.fail(); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(request); logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); onError(request); return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } if (proxyProvider != null && proxy != null) { proxyProvider.returnProxy(proxy, page, task); } } }
public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); }
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); if (!request.isBinaryContent()){ if (charset == null) { charset = getHtmlCharset(contentType, bytes); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); } page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } return page; }
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { charset = Charset.defaultCharset().name(); logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); } return charset; } }
import java.io.IOException; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map;
import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager;
import org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; import org.apache.http.client.CookieStore; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.downloader.CustomRedirectStrategy;
/**
-
@author [email protected]
-
@since 0.4.0 */ public class HttpClientGenerator {
private transient Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator() { Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", buildSSLConnectionSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager.setDefaultMaxPerRoute(100); }
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, null, new DefaultHostnameVerifier()); // ä¼å ç»è¿å®å ¨è¯ä¹¦ } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } return SSLConnectionSocketFactory.getSocketFactory();
}
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { // å®ç°ä¸ä¸ªX509TrustManageræ¥å£ï¼ç¨äºç»è¿éªè¯ï¼ä¸ç¨ä¿®æ¹éé¢çæ¹æ³ X509TrustManager trustManager = new X509TrustManager() {
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { } public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { } public X509Certificate[] getAcceptedIssuers() { return null; } }; SSLContext sc = SSLContext.getInstance("TLS"); sc.init(null, new TrustManager[]{trustManager}, null); return sc;
}
public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); return this; }
public CloseableHttpClient getClient(Site site) { return generateClient(site); }
private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom();
httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { httpClientBuilder.setUserAgent(""); } if (site.isUseGzip()) { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { public void process( final HttpRequest request, final HttpContext context) throws HttpException, IOException { if (!request.containsHeader("Accept-Encoding")) { request.addHeader("Accept-Encoding", "gzip"); } } }); } //解å³post/redirect/post 302跳转é®é¢ httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy()); SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); generateCookie(httpClientBuilder, site); return httpClientBuilder.build();
}
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { if (site.isDisableCookieManagement()) { httpClientBuilder.disableCookieManagement(); return; } CookieStore cookieStore = new BasicCookieStore(); for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) { BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); cookie.setDomain(site.getDomain()); cookieStore.addCookie(cookie); } for (Map.Entry<String, Map<String, String>> domainEntry : site.getAllCookies().entrySet()) { for (Map.Entry<String, String> cookieEntry : domainEntry.getValue().entrySet()) { BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); cookie.setDomain(domainEntry.getKey()); cookieStore.addCookie(cookie); } } httpClientBuilder.setDefaultCookieStore(cookieStore); }
}
package com.ygq.demo01.domain;
import com.ygq.demo01.controller.GithubRepoPageProcessor; import com.ygq.demo01.utils.HttpClientDownloader; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.Pipeline;
@TargetUrl("https://github.com/\w+/\w+") @HelpUrl("https://github.com/\w+") public class GithubRepo {
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
private String name;
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
private String author;
@ExtractBy("//div[@id='readme']/tidyText()")
private String readme;
public static void main(String[] args) {
// OOSpider.create(Site.me().setSleepTime(1000) // , new ConsolePageModelPipeline(), GithubRepo.class) // .addUrl("https://github.com/code4craft").thread(5).run(); Spider.create(new GithubRepoPageProcessor()).setDownloader(new HttpClientDownloader()).addUrl("https://github.com/code4craft").thread(5).run(); // OOSpider.create(new GithubRepoPageProcessor()).setDownloader(new HttpClientDownloader()).addUrl("https://github.com/code4craft").thread(5).run(); } }`