[Java] Scraping movie download links from Dytt (电影天堂) and Piaohua (飘花电影网) with Java
A while back I saw a forum member's Python scraper for movie download links, and on a whim I decided to write one in Java. It really isn't hard; the code is below.
1. Here is the scraper for Dytt (电影天堂). Along the way I tried setting a proxy and using a thread pool, but neither seemed to work; see the sketch after the listing. The main jars are httpclient 4.5 and jsoup 1.7.
[Java]
package downloade;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import Pojo.DyUrl;
import dao.JDBCUtils;

public class Dyttdownload {

    static int id = 1;
    public static HttpClient client = null;

    public static void main(String[] args) {
        // ExecutorService fixedThreadPool = Executors.newFixedThreadPool(10);
        Map<Integer, String> map = new HashMap<>();
        for (int i = 1; i < 50; i++) {
            // e.g. http://www.ygdy8.net/html/gndy/dyzz/list_23_2.html
            map.put(i, "http://www.ygdy8.net/html/gndy/dyzz/list_23_" + i + ".html");
        }
        for (String string : map.values()) {
            getUrl(string);
            // Thread.sleep(2000);
        }
        // getDownloadUrl("http://www.ygdy8.net/html/gndy/dyzz/20170926/55094.html");
    }

    // Fetch one list page, pull the name and detail-page link of every movie
    // entry, and store the resolved download link in the database.
    public static void getUrl(String uri) {
        JDBCUtils utils = new JDBCUtils();
        try {
            client = HttpClients.createDefault();
            // RequestConfig config = RequestConfig.custom().setProxy(new HttpHost("110.73.14.161", 8123)).build();
            HttpGet get = new HttpGet(uri);
            // get.setConfig(config);
            HttpResponse response = client.execute(get);
            // The site serves gb2312-encoded pages
            String result = EntityUtils.toString(response.getEntity(), "gb2312");
            Document doc = Jsoup.parse(result);
            // CSS selector: each movie entry sits in a table.tbspan
            Elements elements = doc.select("table.tbspan");
            for (Element element : elements) {
                // Set the base URI so relative hrefs resolve to absolute URLs
                element.setBaseUri("http://www.ygdy8.net");
                DyUrl dy = getDownloadUrl(
                        element.select("tr").get(1).select("a").text(),
                        element.select("tr").get(1).select("a").attr("abs:href"));
                dy.setId(id);
                utils.insert(dy);
                id++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Fetch a movie's detail page and read the download link out of the
    // #Zoom block.
    public static DyUrl getDownloadUrl(String name, String dyurl) {
        DyUrl dy = new DyUrl();
        // RequestConfig config = RequestConfig.custom().setProxy(new HttpHost("110.73.14.161", 8123)).build();
        try {
            client = HttpClients.createDefault();
            HttpGet get = new HttpGet(dyurl);
            // get.setConfig(config);
            HttpResponse response = client.execute(get);
            String result = EntityUtils.toString(response.getEntity(), "gb2312");
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("div#Zoom table tr td a");
            dy.setDyname(name);
            dy.setDyUrl(elements.get(0).text());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return dy;
    }
}
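The proxy and thread-pool attempts mentioned above are left commented out in the listing. For anyone who wants to retry them, here is a minimal sketch, assuming a reachable HTTP proxy (the address below is a placeholder): httpclient's RequestConfig carries the proxy per request, and one shared CloseableHttpClient, which is thread-safe, serves all workers of a fixed thread pool.

[Java]
package downloade;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class ProxyPoolSketch {

    public static void main(String[] args) throws InterruptedException {
        // One shared client; CloseableHttpClient is thread-safe, so every
        // worker can use it instead of recreating a client per request.
        CloseableHttpClient client = HttpClients.createDefault();

        // Route each request through an HTTP proxy. The address is a
        // placeholder and must be replaced with a live proxy.
        RequestConfig config = RequestConfig.custom()
                .setProxy(new HttpHost("127.0.0.1", 8123))
                .setConnectTimeout(5000)
                .setSocketTimeout(5000)
                .build();

        ExecutorService pool = Executors.newFixedThreadPool(10);
        for (int i = 1; i < 50; i++) {
            final String url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_" + i + ".html";
            pool.submit(() -> {
                HttpGet get = new HttpGet(url);
                get.setConfig(config);
                try (CloseableHttpResponse response = client.execute(get)) {
                    String html = EntityUtils.toString(response.getEntity(), "gb2312");
                    System.out.println(url + " -> " + html.length() + " chars");
                } catch (Exception e) {
                    e.printStackTrace();
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(10, TimeUnit.MINUTES);
    }
}

If the original attempt failed silently, a likely culprit is a dead public proxy combined with missing timeouts: without setConnectTimeout/setSocketTimeout, a bad proxy makes every worker hang.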
2. Below is the Piaohua (飘花电影网) version. As you can see, the crawling process is much the same; only the CSS selectors differ.
[Java]
package downloade;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import Pojo.DyUrl;
import dao.JDBCUtils;

public class piaohuadownload {

    static int id = 1;
    public static HttpClient client = null;

    public static void main(String[] args) {
        Map<Integer, String> map = new HashMap<>();
        for (int i = 16; i < 50; i++) {
            map.put(i, "http://www.piaohua.com/html/dongzuo/list_" + i + ".html");
        }
        for (String string : map.values()) {
            System.out.println("Crawling list page " + string);
            getUrl(string);
        }
    }

    // Fetch one list page; on Piaohua each movie entry is a <dl> under #list.
    public static void getUrl(String uri) {
        JDBCUtils utils = new JDBCUtils();
        try {
            client = HttpClientBuilder.create().build();
            HttpResponse response = client.execute(new HttpGet(uri));
            // This site serves UTF-8 pages
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            Document doc = Jsoup.parse(result);
            // Set the base URI so absUrl() can resolve relative links
            doc.setBaseUri("http://www.piaohua.com");
            Elements elements = doc.select("#list dl");
            for (Element element : elements) {
                String name = element.select("font").first().text();
                String dyurl = element.select("a").first().absUrl("href");
                DyUrl dy = getDownloadUrl(name, dyurl);
                dy.setId(id);
                utils.insert(dy);
                id++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Fetch a movie's detail page and read the download link from the
    // #showinfo block.
    public static DyUrl getDownloadUrl(String name, String dyurl) {
        DyUrl dUrl = new DyUrl();
        try {
            client = HttpClients.createDefault();
            HttpResponse response = client.execute(new HttpGet(dyurl));
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("#showinfo").select("a");
            dUrl.setDyname(name);
            dUrl.setDyUrl(elements.first().text());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return dUrl;
    }
}
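Both listings depend on a DyUrl POJO and a JDBCUtils DAO that the post doesn't include. Here is a minimal sketch of what they might look like, assuming a MySQL table dyurl(id INT, dyname VARCHAR(255), dyurl TEXT); the connection URL, user, and password are placeholders.

[Java]
// File Pojo/DyUrl.java -- a guess at the POJO used above: just an id,
// a movie name, and the download link.
package Pojo;

public class DyUrl {
    private int id;
    private String dyname;
    private String dyUrl;

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getDyname() { return dyname; }
    public void setDyname(String dyname) { this.dyname = dyname; }
    public String getDyUrl() { return dyUrl; }
    public void setDyUrl(String dyUrl) { this.dyUrl = dyUrl; }
}

[Java]
// File dao/JDBCUtils.java -- hypothetical DAO matching the insert(DyUrl)
// call in the crawlers; requires the MySQL JDBC driver on the classpath.
package dao;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

import Pojo.DyUrl;

public class JDBCUtils {
    // Placeholder connection settings -- adjust to your own database.
    private static final String URL =
            "jdbc:mysql://localhost:3306/movie?useUnicode=true&characterEncoding=utf8";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    public void insert(DyUrl dy) {
        String sql = "INSERT INTO dyurl (id, dyname, dyurl) VALUES (?, ?, ?)";
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setInt(1, dy.getId());
            ps.setString(2, dy.getDyname());
            ps.setString(3, dy.getDyUrl());
            ps.executeUpdate();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}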
Finally, here are the screenshots of a successful run.
The last one shows the results used in a web page.
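The code behind that web page isn't shown in the post; here is a minimal sketch of how the stored rows could be served, assuming a Servlet 3.0 container and the hypothetical dyurl table above (credentials are placeholders again).

[Java]
package web;

import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

// Hypothetical servlet that renders the crawled download links as a table.
@WebServlet("/movies")
public class MovieListServlet extends HttpServlet {
    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException {
        resp.setContentType("text/html;charset=utf-8");
        PrintWriter out = resp.getWriter();
        out.println("<table border='1'><tr><th>Name</th><th>Download</th></tr>");
        String sql = "SELECT dyname, dyurl FROM dyurl ORDER BY id";
        try (Connection conn = DriverManager.getConnection(
                     "jdbc:mysql://localhost:3306/movie", "root", "root");
             PreparedStatement ps = conn.prepareStatement(sql);
             ResultSet rs = ps.executeQuery()) {
            while (rs.next()) {
                out.println("<tr><td>" + rs.getString("dyname")
                        + "</td><td><a href='" + rs.getString("dyurl")
                        + "'>link</a></td></tr>");
            }
        } catch (Exception e) {
            throw new IOException(e);
        }
        out.println("</table>");
    }
}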