```
/**
 * Controller entry point mapped to POST "doGrab": logs in with the given credentials and
 * runs the crawler against the member index page.
 *
 * Steps visible in this excerpt:
 * 1. obtain a cookie string from simulationHttpUtil.getCookie(username, password);
 * 2. split it on "=" and hand the second part to webMagicUtil.setSite, and the whole
 *    string to webMagicUtil.setCook;
 * 3. create a Spider over webMagicUtil with a ConsolePipeline and 5 threads, and run it;
 * 4. print webMagicUtil.getLength() once run() has returned.
 * Any exception is caught and only its stack trace is printed.
 *
 * NOTE(review): this excerpt ends right after the catch block; the method's return
 * statement and closing brace are not visible here, so the returned value is unknown.
 *
 * @param username account name forwarded to getCookie
 * @param password account password forwarded to getCookie
 * @param request  the servlet request; not used in the visible lines
 */
@
RequestMapping(value = "doGrab",method =
RequestMethod.POST)
public String doGrab(String username, String password, HttpServletRequest request){
try {
// Log in and fetch the cookie string that the crawler will reuse.
String cookie = simulationHttpUtil.getCookie(username,password);
// assumes the cookie string has the form name=value, so cookies[1] is the value - TODO confirm
String cookies[] = cookie.split("=");
webMagicUtil.setSite(cookies[1]);
webMagicUtil.setCook(cookie);
Spider.create(webMagicUtil)
// Start crawling from this page
.addUrl("
http://www.digifilm.com.cn/index.php/member/index")
.addPipeline(new ConsolePipeline())
// Crawl with 5 threads
.thread(5)
// Start the crawler
.run();
// Print the value reported by webMagicUtil.getLength() after the crawl.
System.out.print(webMagicUtil.getLength());
} catch (Exception e) {
e.printStackTrace();
}
@
Component //给爬虫供给 cookie 的方法
public String getCookie (String username,String password) throws Exception {
RequestConfig requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.BEST_MATCH).build();//标准 cookie 策略 /*.STANDARD_STRICT*/
CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();//设置进去
HttpGet getHomePage = new HttpGet("
http://www.digifilm.com.cn/index.php/public/login");
getHomePage.setHeader("Accept","text/html,application/xhtml+xml,image/jxr,*/*");
getHomePage.setHeader("Accept-Encoding","gzip,deflate");
getHomePage.setHeader("Accept-Language","zh-CN");
getHomePage.setHeader("Connection","Keep-Alive");
getHomePage.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393");
//填充登陆请求中基本的参数
CloseableHttpResponse response = httpClient.execute(getHomePage);
String rec = setCookie(response);
//printResponse(response);
//首页的源码
String responseHtml = EntityUtils.toString(response.getEntity());
//首页中的 html 代码<input name="__hash__" type="hidden" value=""/>
String hashValue = responseHtml.split("<input type=\"hidden\" name=\"__hash__\" value=\"")[1].split("\" />")[0];
response.close();
List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
valuePairs.add(new BasicNameValuePair("__hash__" , hashValue));
valuePairs.add(new BasicNameValuePair("password", password));
valuePairs.add(new BasicNameValuePair("username", username));
while (true){
//获取验证码"
HttpGet getCaptcha = new HttpGet("
http://www.digifilm.com.cn/index.php/Verify/verify/?rand=" + Math.random());
CloseableHttpResponse imageResponse = httpClient.execute(getCaptcha);
//把响应的 png 格式图片转换成 jpg 格式。
InputStream in = imageResponse.getEntity().getContent();
BufferedImage bufferedImage = imageUtil.imageChange(in);
imageResponse.close();
in.close();
//图片去噪
File file = imageUtil.cleanImage(bufferedImage);
//识别去噪后的图片
String text = scanCodeUtil.recognizeText(file);
System.out.println("扫描后的图片:"+text);
valuePairs.add(new BasicNameValuePair("verify", text));
//完成登陆请求的构造
UrlEncodedFormEntity entity = new UrlEncodedFormEntity(valuePairs, Consts.UTF_8);
HttpPost post = new HttpPost("
http://www.digifilm.com.cn/index.php/public/checklogin");
post.setEntity(entity);
CloseableHttpResponse httpResponse = httpClient.execute(post);//登录并返回响应对象
httpResponse.close();
//构造一个 get 请求,用来测试登录 cookie 是否拿到
HttpGet g = new HttpGet("
http://www.digifilm.com.cn/index.php/member/index");//获取登录后页面
//将 cookie 注入到 get 请求头当中。未得到 cookie 就会把请求头里的 cookie 清空。造成失败。
//可关闭的响应对象。
CloseableHttpResponse r = httpClient.execute(g);
Header headers= r.getFirstHeader("Content-Length");
Integer contentLength = Integer.parseInt(headers.getValue());
if(contentLength > 7000){
r.close();
break;
}
r.close();
}
//httpClient.close();
String rec2 = rec.split(";")[2];
return rec2;
}
}
@
Componentpublic class WebMagicUtil implements PageProcessor {
private int length;
//部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数、超时时间等
private Site site ;
public void setSite(String cookie) {
this.site =
Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(1000*60*60).setCycleRetryTimes(3)
//添加 cookie 之前一定要先设置主机地址,否则 cookie 信息不生效
.setDomain("
www.digifilm.com.cn")
//添加获取的 cookie 信息;
.addCookie("PHPSESSID",cookie)
//添加请求头,网站会根据请求头判断该请求是由浏览器发起还是爬虫发起。
.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393")
.addHeader("Accept","text/html, application/xhtml+xml, image/jxr, */*")
.addHeader("Accept-Encodin","gzip,deflate")
.addHeader("Accept-Language","zh-CN")
.addHeader("Connection","Keep-Alive");
//.addHeader("Referer","
http://www.digifilm.com.cn/index.php/public/login");;
}
/**
 * Extraction callback invoked for every downloaded page; what happens depends on the page URL.
 *
 * <ul>
 *   <li>Member index page: queue the key-list links found under the "leaguer" div.</li>
 *   <li>Key-list page: queue the single-download pages and the pagination links.</li>
 *   <li>Single-download page: store title, schedule, type and the download links as result
 *       fields, download every link, and count the page in {@code length}.</li>
 * </ul>
 *
 * @param page the page handed over by the crawler
 */
@Override
public void process(Page page) {
    // Start page: collect links to the key-list pages.
    if (page.getUrl().regex("(.*/index\\.php/member/index)").match()) {
        page.addTargetRequests(page.getHtml().xpath("//div[@class=leaguer]").links().regex("(.*/index\\.php/(\\w+)_down/index)").all());
    }

    // Key-list page: collect the single-download pages ...
    if (page.getUrl().regex("(.*/index\\.php/(\\w+)_down/index)").match()) {
        page.addTargetRequests(page.getHtml().xpath("//div[@class='SMAMiddle SMAMiddlelb']").links().regex("(.*/index\\.php/(\\w+)_down/content/id/.*)").all());
        // ... and the pagination links.
        page.addTargetRequests(page.getHtml().xpath("//div[@class=fanye_1]").links().regex("(.*/index\\.php/(\\w+)_down/index\\?&p=\\d+)").all());
    }

    // Single-download page: extract the fields and fetch the key files.
    if (page.getUrl().regex("(.*/index\\.php/(\\w+)_down/content/id/\\w+)").match()) {
        page.putField("filmTitle", page.getHtml().xpath("//div[@class='videoDescri']/span[1]/text()"));
        page.putField("filmSchedule", page.getHtml().xpath("//div[@class='videoDescri']/span[2]/text()"));
        page.putField("filmType", page.getHtml().xpath("//div[@class='videoDescri']/span[3]/text()"));

        // Keep the link list in a local instead of reading it back out of the result items.
        List<String> keyLinks = page.getHtml().xpath("//div[@class='SMAMiddle SMAMiddlela']/ul/li/a[@class='load']").links().regex("(.*/download\\.php\\?mid=.*)").all();
        page.putField("secretKey", keyLinks);

        for (String url : keyLinks) {
            System.out.println(url);
            // downloadFromUrl reports failures itself (stack trace plus "Fault" result),
            // so no additional try/catch is needed around the call.
            downloadFromUrl(url, "C:\\360Downloads\\Test\\");
        }

        // The spider may call process from several worker threads at once; a bare
        // length++ is a read-modify-write and would lose updates.
        synchronized (this) {
            length++;
        }
    }
}
/**
 * Downloads the resource at {@code url} into the directory {@code dir} (test download code).
 * The target file name comes from {@link #getFileNameFromUrl(String)}.
 *
 * @param url address of the resource to download
 * @param dir directory to store the file in; created, including parents, when missing
 * @return {@code "Successful!"} when the file was written, {@code "Fault"} on any failure
 *         (the stack trace is printed in that case)
 */
public static String downloadFromUrl(String url, String dir) {
    try {
        URL httpurl = new URL(url);
        String fileName = getFileNameFromUrl(url);
        System.out.println(fileName);

        File saveDir = new File(dir);
        // mkdirs (not mkdir) so that missing parent directories are created as well.
        // The isDirectory re-check covers another thread creating the directory first.
        if (!saveDir.mkdirs() && !saveDir.isDirectory()) {
            System.err.println("Cannot create download directory: " + saveDir);
            return "Fault";
        }

        // copyURLToFile creates (or overwrites) the destination itself, so no
        // createNewFile call is needed beforehand.
        File file = new File(saveDir, fileName);
        FileUtils.copyURLToFile(httpurl, file);
    } catch (Exception e) {
        e.printStackTrace();
        return "Fault";
    }
    return "Successful!";
}
/**
 * Generates the file name under which a download is stored.
 *
 * <p>The name is the current time in milliseconds, a dash and a random UUID, followed by
 * {@code .xml}. The random part keeps names distinct when several downloads start within
 * the same millisecond, which would otherwise overwrite each other's files.
 *
 * @param url the download URL; currently not used to derive the name
 * @return a unique file name ending in {@code .xml}
 */
public static String getFileNameFromUrl(String url) {
    // Fully qualified to avoid depending on an import that the file may not have.
    return System.currentTimeMillis() + "-" + java.util.UUID.randomUUID() + ".xml";
}
```