A Tiny Crawler

To catch each new episode of 《权力的游戏》 (Game of Thrones) as soon as it came out, I decided to write a crawler and have a crontab job email me every day (the crontab entry is sketched after the wrapper script below).

package crawler;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import javax.mail.*;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by hero on 17-8-13.
 */
public class Dianying {

    // Fetch the page at `url`, scan it line by line, and collect every regex match wrapped in <h3> tags.
    private static String crawlWeb(String url, String charset, String regex) throws IOException {
        StringBuilder sb = new StringBuilder();
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        Pattern pattern = Pattern.compile(regex);
        CloseableHttpResponse response = null;
        BufferedReader reader = null;
        Matcher matcher;
        try {
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36");
            httpGet.setHeader("Connection", "keep-alive");
            httpGet.setHeader("Referer", "http://www.a.com");
            // httpGet.setHeader("Content-type", "text/xml; charset=UTF-8");
            response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            reader = new BufferedReader(new InputStreamReader(entity.getContent(), charset), 4096);
            String line;
            while ((line = reader.readLine()) != null) {
                // System.out.println("line---" + line);
                matcher = pattern.matcher(line);
                if (matcher.find()) {
                    System.out.println(matcher.group());
                    sb.append("<h3>");
                    sb.append(matcher.group());
                    sb.append("</h3>");
                }
            }
            EntityUtils.consume(entity);
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            if (response != null)
                response.close();
            if (reader != null)
                reader.close();
        }
        return sb.toString();
    }

    // Send the collected links as an HTML mail through the 163.com SMTP server.
    private static void sendEmail(String subject, String content) {
        String to = "zxfspace@163.com";
        String from = "zxfspace@163.com";
        Properties properties = System.getProperties();
        properties.setProperty("mail.transport.protocol", "smtp");
        properties.setProperty("mail.smtp.host", "smtp.163.com");
        properties.setProperty("mail.smtp.auth", "true");
        Session session = Session.getDefaultInstance(properties);
        try {
            MimeMessage message = new MimeMessage(session);
            message.setFrom(new InternetAddress(from));
            message.addRecipient(Message.RecipientType.TO, new InternetAddress(to));
            message.setSubject(subject);
            message.setContent(content, "text/html;charset=UTF-8");
            Transport transport = session.getTransport();
            transport.connect("zxfspace@163.com", "password"); // SMTP password (placeholder)
            transport.sendMessage(message, message.getAllRecipients());
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    public static void main(String[] args) {
        try {
            String res = crawlWeb("http://www.dy2018.com/i/98194.html", "gb2312", "ftp://m:m@tv.dl1234.com:2199/权力的游戏第七季[0-9]{2}.{0,2}\\.mp4");
            System.out.println(res);
            sendEmail("权力的游戏7更新提示", res);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
#!/bin/bash

export PATH=/opt/maven/bin:/opt/node/bin:/usr/lib/jdk/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
export JAVA_HOME=/usr/lib/jdk
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH

classp="/home/hero/tmp/crawler"

cd /home/hero/tmp/crawler

$JAVA_HOME/bin/java -classpath ".:$classp/commons-logging-1.2.jar:$classp/httpclient-4.5.3.jar:$classp/httpcore-4.4.6.jar:$classp/mail-1.4.7.jar" crawler.Dianying
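
The crontab entry itself is not shown above; a minimal sketch, assuming the wrapper is saved as /home/hero/tmp/crawler/run.sh (a hypothetical name) and marked executable, could look like this:

# m h dom mon dow command: run the crawler every morning at 09:00 and append its output to a log
0 9 * * * /home/hero/tmp/crawler/run.sh >> /home/hero/tmp/crawler/cron.log 2>&1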

I also wrote an evil little script that lists every movie of a given category on a certain site, sorted by rating from high to low.

#!/bin/bash

# The evil crawler

startTime=`date`

URL="https://www.xxx.com"
START_PAGE_INDEX=1
END_PAGE_INDEX=`expr $START_PAGE_INDEX + 5`
echo $END_PAGE_INDEX

# Fetch every video page listed in the file $1 and keep the ones rated above 60.
extractVideo () {
    FILE=$1
    FINAL=result-$FILE

    for i in `cat $FILE`; do
        page=`curl "$i"`
        rate=`echo $page | grep -o 'percent">[0-9]\{1,3\}' | grep -o '[0-9]\{1,3\}'`
        if (( rate > 60 )); then
            echo "$rate $i" >> $FINAL
        fi
    done
    echo "finish"
}

day=`date "+%Y%m%d"`
if [ -d $day ]; then
    rm -rf $day
fi
mkdir $day && cd $day

# Collect the video links from each listing page, then score them in a background job.
for ((i=$START_PAGE_INDEX; i<$END_PAGE_INDEX; i++)); do
    videos=`curl "https://www.xxx.com/video?c=111&page=$i" | grep -o "\/view_video\.php?viewkey=[a-z0-9]\{1,20\}" | uniq`
    for j in $videos; do
        echo "$URL"$j >> f$i
    done
    (extractVideo f$i &)
done

# Wait until all the background curl jobs have finished.
while (( `ps -ef | grep curl | wc -l` > 1 )); do
    sleep 30s
done

cat result* | sort -nr -k1 | uniq > list

find . -type f -name "result*" -delete
find . -type f -name "f*" -delete

echo "start time --- "$startTime
echo "end time --- "`date`

Both of the scripts above only work on simple sites: they barely restrict crawlers and their pages follow a regular layout, so a plain HTTP request plus a regex is enough. Anything more sophisticated calls for Python:
Requests: HTTP for Humans: send HTTP requests the way a browser does
Beautiful Soup Documentation: parse HTML and XML documents

Understanding the concepts is important:
How servers tell different users apart: the Cookie/Session mechanism explained (如何区分不同用户——Cookie/Session机制详解). A small sketch of the idea follows.
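
To make the Cookie/Session idea concrete, here is a minimal sketch (not part of the original scripts; example.com, the paths and the form fields are placeholders) of how a requests.Session keeps whatever cookies the server sets and replays them on later requests:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Sketch only: example.com, the paths and the form fields are placeholders.

import requests

s = requests.Session()

# Whatever cookies the server sets on login (e.g. a session id) end up in s.cookies...
r = s.post('https://example.com/login', data={'user': 'me', 'password': 'secret'})
print(r.status_code)
print(s.cookies.get_dict())

# ...and are sent back automatically on every later request made through the
# same Session, so the server can recognise the same logged-in user.
r = s.get('https://example.com/profile')
print(r.status_code)

The scripts below do essentially this, with the extra step of scraping the hidden form tokens (_xsrf, __VIEWSTATE) that the servers also expect.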

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import codecs
import requests

url = "http://xxx.xxx.com/index.aspx"
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8','Accept-Encoding':'gzip, deflate','Accept-Language':'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2','Cache-Control':'max-age=0','Connection':'keep-alive','Content-Length':'559','Content-Type':'application/x-www-form-urlencoded','Host':'eds.topcheer.com','Origin':'http://eds.topcheer.com','Referer':'http://eds.topcheer.com/index.aspx','Upgrade-Insecure-Requests':'1','User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
cookies = {'LanguageInfo=Langange':'_CN','ASP.NET_SessionId':'fjejbx3vesfiat450lzeit55','EdsU001':'','EdsU002':'','EdsU003':'','EdsU004':'','EdsU005':'','EdsU006':'ssY2VaPluVl1PV9B5OzZtg=='}
payload = {'__EVENTTARGET':'','__EVENTARGUMENT':'','__VIEWSTATE':'/wEPDwUKMTY0MzUwMjAyNQ9kFgQCAw9kFgYCAQ8WAh4EVGV4dAUJ55So5oi3SUQ6ZAIFDxYCHwAFB+WvhueggTpkAgkPDxYCHwAFEeW/mOiusOWvhuegge+8nz4+FgIeB29uY2xpY2sFdWphdmFzY3JpcHQ6dmFyIHdpbj13aW5kb3cub3BlbignR2V0UFNXLmFzcHgnLG51bGwsJ3Jlc2l6YWJsZT15ZXMgLHNjcm9sbGJhcnM9eWVzLHdpZHRoPTU1MCxoZWlnaHQ9NDAwJyk7cmV0dXJuIGZhbHNlO2QCBQ8WAh8AZWRkg3l1u9Gt0dqFY3OJWoChL1bjCBU=','__VIEWSTATEGENERATOR':'90059987','__EVENTVALIDATION':'/wEWBQKM5P+GDAKRi8C1BAK3jsrkBALP37PRCQKlwImNC2jwVzQYZPX2pUZ3j92w8YlMso55','tbUserId':'','tbPassword':'','ctl04':''}

f = codecs.open("login.html", "w", "utf-8")

r = requests.post(url, headers=headers, data=payload, cookies=cookies)
print r.text
print r.status_code
print r.headers

f.write(r.text)
f.close()

Python instead of Java

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import codecs
import smtplib
from email.header import Header
from email.mime.text import MIMEText
import requests
from bs4 import BeautifulSoup
import re

headers = {'Accept':'text/html','Accept-Encoding':'gzip, deflate','Accept-Language':'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2','Cache-Control':'max-age=0','Connection':'keep-alive','Host':'www.dy2018.com','Referer':'http://www.dy2018.com/','Upgrade-Insecure-Requests':'1','User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
r = requests.get("http://www.dy2018.com/i/98194.html", headers=headers)
r.encoding = 'gb2312'
f = codecs.open('icefire', 'w+', 'utf-8')
soup = BeautifulSoup(r.text, 'html.parser')

# Keep only the links whose href contains "mp4" (the per-episode download links).
def mp4(href):
    return href and re.compile("mp4").search(href)

txt = ''
for link in soup.find_all(href=mp4):
    txt = txt + link.get('href') + '\n'
print txt.encode('utf-8').strip()
f.write(txt)
f.close()

## Send the email
msg = MIMEText(txt, 'plain', 'utf-8')
msg['Subject'] = Header('权力的游戏7', 'utf-8')
msg['From'] = 'zxfspace@163.com'
msg['To'] = 'zxfspace@163.com'
s = smtplib.SMTP('smtp.163.com')
s.login('zxfspace@163.com', 'password')
s.sendmail('zxfspace@163.com', 'zxfspace@163.com', msg.as_string())
s.quit()

Zhihu, which everyone has already crawled to death
For reference:

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
import cookielib as cookiejar   # Python 2; on Python 3 use: from http import cookiejar
from bs4 import BeautifulSoup
from PIL import Image
import pytesseract
import time
import codecs
import os

USERNAME = ''
PASSWORD = ''

headers = {
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'Host':'www.zhihu.com',
    'Referer':'https://www.zhihu.com/',
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
}

def get_xsrf():
    # The login form carries a hidden _xsrf token that has to be posted back.
    r = s.get('https://www.zhihu.com', headers=headers)
    s.cookies.save()
    print 'get_xsrf status code : %s' % r.status_code
    soup = BeautifulSoup(r.text, 'html.parser')
    xsrf = soup.find(attrs={"name":"_xsrf"})
    print "_xsrf=%s" % xsrf['value']
    return xsrf['value']

def get_cookies():
    cj = cookiejar.LWPCookieJar(filename='cookies_zhihu')
    try:
        cj.load()
    except:
        # No saved cookie file yet (e.g. on the first run); start with an empty jar.
        print "load cookie failed."
    return cj

def get_captcha():
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = s.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    # Show the captcha with pillow's Image; if pillow is not installed,
    # open captcha.jpg in the source directory and read it off manually.
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except:
        print(u'请到 %s 目录找到captcha.jpg 手动输入' % os.path.abspath('captcha.jpg'))
    captcha = raw_input("please input the captcha:")
    return captcha

def login():
    url = 'https://www.zhihu.com/login/email'
    data = {
        '_xsrf':get_xsrf(),
        'password':PASSWORD,
        'email':USERNAME,
        'captcha':get_captcha()
    }
    r = s.post(url, headers=headers, data=data)
    r.encoding = 'utf-8'
    print r.text
    print r.status_code
    s.cookies.save(ignore_discard=True)

s = requests.Session()
s.cookies = get_cookies()
login()
r = s.get('https://www.zhihu.com/', headers=headers)
print r.text

Pulling last month's overtime list

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
import cookielib as cookiejar   # Python 2; on Python 3 use: from http import cookiejar
from bs4 import BeautifulSoup
from PIL import Image
import pytesseract
import time
import datetime
import codecs
import os

headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8','Accept-Encoding':'gzip, deflate','Accept-Language':'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2','Cache-Control':'max-age=0','Connection':'keep-alive','Content-Length':'559','Content-Type':'application/x-www-form-urlencoded','Host':'eds.topcheer.com','Upgrade-Insecure-Requests':'1','User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
h1 = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8','Accept-Encoding':'gzip, deflate','Accept-Language':'en-US,en;zh-CN','Connection':'keep-alive','DNT':'1','Host':'eds.topcheer.com','Upgrade-Insecure-Requests':'1','User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

class DynamicKey:
    # The three hidden ASP.NET form fields that have to be echoed back on every POST.
    VIEWSTATE = ''
    VIEWSTATEGENERATOR = ''
    EVENTVALIDATION = ''

    def __init__(self, vs, vsg, ev):
        self.VIEWSTATE = vs
        self.VIEWSTATEGENERATOR = vsg
        self.EVENTVALIDATION = ev

    def __str__(self):
        return "{VIEWSTATE:%s,VIEWSTATEGENERATOR:%s,EVENTVALIDATION:%s}" % (self.VIEWSTATE, self.VIEWSTATEGENERATOR, self.EVENTVALIDATION)

def get_cookies():
    cj = cookiejar.LWPCookieJar(filename='cookies_eds')
    try:
        cj.load()
    except:
        # No saved cookie file yet; start with an empty jar.
        print "load cookie failed."
    return cj

def get_dynamickey(html):
    # Pull the hidden __VIEWSTATE / __VIEWSTATEGENERATOR / __EVENTVALIDATION fields out of the page.
    soup = BeautifulSoup(html, 'html.parser')
    vs = soup.find(attrs={'name':'__VIEWSTATE'})
    vsg = soup.find(attrs={'name':'__VIEWSTATEGENERATOR'})
    ev = soup.find(attrs={'name':'__EVENTVALIDATION'})
    dk = DynamicKey(vs['value'], vsg['value'], ev['value'])
    # print dk
    return dk

def login():
    # Landing page
    url = "http://eds.topcheer.com"
    r = s.get(url, headers=h1)
    s.cookies.save(ignore_discard=True)
    r.encoding = 'utf-8'
    dk = get_dynamickey(r.text)
    data = {
        '__EVENTTARGET':'',
        '__EVENTARGUMENT':'',
        '__VIEWSTATE':dk.VIEWSTATE,
        '__VIEWSTATEGENERATOR':dk.VIEWSTATEGENERATOR,
        '__EVENTVALIDATION':dk.EVENTVALIDATION,
        'tbUserId':'',
        'tbPassword':'',
        'ctl04':''
    }
    r = s.post('http://eds.topcheer.com/index.aspx', headers=h1, data=data)
    s.cookies.save(ignore_discard=True)
    print 'login succeeded'
    return

# Query the overtime records
def query_overtime_list():
    # First day of last month: step back one day from the first of this month
    # (this also works in January and for single-digit months).
    first_of_month = datetime.date.today().replace(day=1)
    last_month = first_of_month - datetime.timedelta(days=1)
    start = last_month.strftime("%Y/%m/01")
    end = time.strftime("%Y/%m/%d", time.localtime())
    print start
    print end
    r = s.post('http://eds.topcheer.com/ajax/EDS.Web.AjaxMethod.Common,EDS.ashx?_method=SetCurrentModelID&_session=rw', headers=h1, data={'modelID':'44'})
    r = s.get('http://eds.topcheer.com/UI/AttendanceMng/OvertimeList.aspx', headers=h1)
    dk = get_dynamickey(r.text)
    data = {
        '__EVENTTARGET':'',
        '__EVENTARGUMENT':'',
        '__VIEWSTATE':dk.VIEWSTATE,
        '__VIEWSTATEGENERATOR':dk.VIEWSTATEGENERATOR,
        '__EVENTVALIDATION':dk.EVENTVALIDATION,
        'datOvertimeStartDate$txt_Date':start,
        'datOvertimeStartDate$controlName':'',
        'datOvertimeEndDate$txt_Date':end,
        'datOvertimeEndDate$controlName':'',
        'ddlApplyType':'0',
        'ddlOrgName$menuLevelID':'4',
        'ddlOrgName$orgID':'17',
        'ddlOrgName$OrgIDString':'( 17)',
        'txtEmployeeNM':'',
        'ddlApplyStatus':'0',
        'btnSearch':'',
        'gridPage$tbPage':''
    }
    r = s.post('http://eds.topcheer.com/UI/AttendanceMng/OvertimeList.aspx', headers=h1, data=data)
    s.cookies.save(ignore_discard=True)
    # print r.text
    soup = BeautifulSoup(r.text, 'html.parser')
    total = soup.find(attrs={'id':'ulblGridCount'})
    print total
    tb = soup.find(attrs={'class':'Grid'})
    print tb

s = requests.Session()
s.cookies = get_cookies()
login()
query_overtime_list()

A half-finished one

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
import cookielib as cookiejar   # Python 2; on Python 3 use: from http import cookiejar
from bs4 import BeautifulSoup
from PIL import Image
import codecs
import os

REDIRECT = ''
TOKEN = ''
CERT = '/home/hero/XX-Net-3.3.6/data/gae_proxy/certs/www.xxx.com.crt'

headers = {
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'Connection':'keep-alive',
    'Host':'www.xxx.com',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) Chrome/59.0.3112.101'
}

def get_cookies():
    cj = cookiejar.LWPCookieJar(filename='cookies_xxx')
    try:
        cj.load()
    except:
        # No saved cookie file yet; start with an empty jar.
        print "load cookies failed."
    return cj

def get_token_redirect():
    # The login form hides a one-time token and a redirect target; grab both from the front page.
    global REDIRECT, TOKEN
    r = s.get('https://www.xxx.com/', headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    redir = soup.find(attrs={"name":"redirect"})
    REDIRECT = redir['value']
    print REDIRECT
    token = soup.find(attrs={"name":"token"})
    f.write(r.text)
    f.close()
    TOKEN = token['value']
    print TOKEN
    s.cookies.save(ignore_discard=True)
    print r.headers

def login():
    get_token_redirect()
    data = {
        'token':TOKEN,
        'redirect':REDIRECT,
        'remember_me':'1',
        'from':'pc_login_modal_:index',
        'username':'',
        'password':'',
        'subscribe':'undefined'
    }
    r = s.post('https://www.xxx.com/front/authenticate', headers=headers, data=data)
    print r.text
    print r.status_code
    s.cookies.save(ignore_discard=True)

f = codecs.open('porn.html', 'w', 'utf-8')
s = requests.Session()
s.cert = CERT
s.cookies = get_cookies()
login()