Web Scraping in Practice

Last updated: May 31, 2020, early morning

1: requests

1.1 Douban now-playing movies with XPath

#!/usr/bin/python3
# -*- coding: utf-8 -*-
from lxml import etree
import requests

## 1. Fetch the source of the target page
headers = {
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    'Referer': "https://cn.bing.com/"
}

url = 'https://movie.douban.com/cinema/nowplaying/beijing/'
res = requests.get(url, headers=headers)

# Save the page source as text
text = res.text

# response.text returns Unicode (str) data
# response.content returns the raw byte stream (bytes)


## 2. Extract data according to a set of rules

# Parse an HTML document from a string constant.
html = etree.HTML(text)

# Now playing is index [0], coming soon is [1]; locate class='lists' with an XPath expression
ul = html.xpath("//ul[@class='lists']")[0]

# print(etree.tostring(ul, encoding='utf-8').decode('utf-8'))

# Locate the li tags with an XPath expression and store them in the list lis
lis = ul.xpath("./li")

# Build the list of movie dicts
movies = []

for li in lis:
    # print(etree.tostring(li, encoding='utf-8').decode('utf-8'))
    title = li.xpath("@data-title")
    score = li.xpath("@data-score")
    duration = li.xpath("@data-duration")
    region = li.xpath("@data-region")
    director = li.xpath("@data-director")
    actors = li.xpath("@data-actors")
    thumbnail = li.xpath(".//img/@src")
    movie = {
        'title': title,
        'score': score,
        'duration': duration,
        'region': region,
        'director': director,
        'actors': actors,
        'thumbnail': thumbnail
    }
    movies.append(movie)

for movie in movies:
    print(movie)
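
One caveat with the loop above: every `li.xpath("@data-title")` call returns a list, so each value stored in the dict is a one-element list. A minimal sketch of the same loop using lxml's `Element.get`, which returns the attribute as a plain string (or None if it is missing):

# Sketch: Element.get() returns the attribute value as a plain string
# instead of a one-element list.
for li in lis:
    movie = {
        'title': li.get('data-title'),
        'score': li.get('data-score'),
        'director': li.get('data-director'),
    }
    print(movie)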

1.2 Movie Heaven (dytt8) with XPath

#!/usr/bin/python3
# -*- coding: utf-8 -*-
from lxml import etree
import requests

BASE_DOMAIN = 'https://dytt8.net'

HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    'Referer': "http://whois.chinaz.com/dytt8.net"
}

# Get the detail-page url of every movie on a list page
def get_detail_urls(url):
    res = requests.get(url, headers=HEADERS)
    # requests guesses the encoding when decoding, and sometimes guesses
    # wrong, so decode the raw bytes manually
    text = res.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)  # <Element html at 0x168712c4540>
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")

    # The commented-out code below is equivalent to the map() call:
    # def abc(url):
    #     return BASE_DOMAIN + url
    # index = 0
    # for detail_url in detail_urls:
    #     detail_url = abc(detail_url)
    #     detail_urls[index] = detail_url
    #     index += 1

    # map() applies the given function to every item of the sequence.
    # The scraped hrefs are relative, so prepend BASE_DOMAIN to get full urls
    detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)

    return detail_urls

# Parse a detail page
def parse_detail_page(url):
    movie = {}
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # Title: xpath returns a list, so take element [0]
    # title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    title = html.xpath("//h1/font/text()")[0]
    movie['title'] = title
    # Poster
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    cover = imgs[0]
    movie['cover'] = cover
    # Details
    def parse_info(info, rule):
        return info.replace(rule, "").strip()
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            # Strip the '◎年  代' label; strip() trims the surrounding whitespace
            info = parse_info(info, "◎年  代")
            movie['year'] = info
        elif info.startswith("◎产  地"):
            info = parse_info(info, "◎产  地")
            movie['country'] = info
        elif info.startswith("◎类  别"):
            info = parse_info(info, "◎类  别")
            movie['category'] = info
        elif info.startswith("◎豆瓣评分"):
            info = parse_info(info, "◎豆瓣评分")
            movie['douban_rating'] = info
        elif info.startswith("◎片  长"):
            info = parse_info(info, "◎片  长")
            movie['duration'] = info
        elif info.startswith("◎导  演"):
            info = parse_info(info, "◎导  演")
            movie['director'] = info
        elif info.startswith("◎主  演"):
            info = parse_info(info, "◎主  演")
            actors = [info]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简  介"):
            info = infos[index + 1].strip()
            movie["profile"] = info
    # Download link
    downloadUrl = html.xpath("//td[@bgcolor='#fdfddf']/a/text()")[0]
    movie["downloadUrl"] = downloadUrl
    return movie

# Crawl
def spider():
    base_url = 'https://dytt8.net/html/gndy/dyzz/list_23_{}.html'
    # Crawl pages 1 to n
    movies = []
    for x in range(1, 2):
        # format() substitutes x into the {} placeholder
        url = base_url.format(x)
        detail_urls = get_detail_urls(url)
        # Walk through every detail page
        for detail_url in detail_urls:
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)

if __name__ == "__main__":
    spider()
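
Note that `map()` returns a one-shot iterator in Python 3, so `detail_urls` can only be walked once. A list comprehension is an equally idiomatic alternative that is eager and reusable (a sketch of the same step in `get_detail_urls`):

# Same transformation as the map() call, but the result is a list
# that can be indexed and iterated more than once.
detail_urls = [BASE_DOMAIN + u for u in detail_urls]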

1.3 Gushiwen with regular expressions (re)

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import re

def parse_page(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    response = requests.get(url, headers=headers)
    text = response.text
    # Poem titles
    # . does not match \n, so add re.DOTALL
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Dynasties; re.S == re.DOTALL
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.S)
    # Authors
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.S)
    # Poem bodies
    content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.S)
    contents = []
    for content in content_tags:
        # Remove tags and newlines; strip() trims surrounding whitespace
        content = re.sub(r'<.*?>|\n', "", content).strip()
        contents.append(content)
    # Assemble the records
    poems = []
    for value in zip(titles, dynasties, authors, contents):
        title, dynasty, author, content = value
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        poems.append(poem)
    for poem in poems:
        print(poem)

def main():
    for i in range(1, 4):
        url = 'https://www.gushiwen.org/default_%s.aspx' % i
        parse_page(url)

if __name__ == '__main__':
    main()
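
A quick self-contained illustration of why `re.DOTALL` (alias `re.S`) is needed here: without it, `.` stops at newlines, so a pattern cannot cross the line breaks in the HTML.

import re

# Hypothetical two-line snippet in the same shape as the scraped HTML
html = '<div class="cont">\n<b>静夜思</b>\n</div>'
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', html))        # [] - '.' cannot cross \n
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', html, re.S))  # ['静夜思']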

2: File storage with JSON and CSV

2.1 json

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json

persons = [
    {
        'name': '张三',
        'age': 20,
        'city': 'beijing'
    },
    {
        'name': '李四',
        'age': 20,
        'city': 'shanghai'
    }
]
# Use dumps to print, dump to write a json file; with Chinese text,
# pass ensure_ascii=False
print(json.dumps(persons, ensure_ascii=False))
with open('persons.json', 'w', encoding='utf-8') as fp:
    json.dump(persons, fp, ensure_ascii=False)

# Read the json file back
with open('persons.json', 'r', encoding='utf-8') as fp:
    persons = json.load(fp)
    print(persons)
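
To see what `ensure_ascii` changes: with the default `ensure_ascii=True`, every non-ASCII character is written as a `\uXXXX` escape, which is still valid JSON but unreadable for Chinese text.

import json

print(json.dumps({'name': '张三'}))                      # {"name": "\u5f20\u4e09"}
print(json.dumps({'name': '张三'}, ensure_ascii=False))  # {"name": "张三"}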

2.2 csv

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import csv

def read_list():
    with open('temp.csv', 'r', encoding='utf-8', newline='') as fp:
        # reader is an iterator
        reader = csv.reader(fp)
        # Skip the header row
        next(reader)
        for x in reader:
            age = x[1]
            height = x[-1]
            print({'age': age, 'height': height})

def read_dict():
    with open('temp1.csv', 'r', encoding='utf-8', newline='') as fp:
        # DictReader does not yield the header row as data
        reader = csv.DictReader(fp)
        for x in reader:
            print(x)

students = ['name', 'age', 'height']

def write_list():
    values = [
        ('张三', 18, 170),
        ('李四', 20, 180)
    ]
    # Pass newline='' so the csv module controls row endings itself;
    # otherwise a blank line appears between rows on Windows
    with open("temp.csv", 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(students)
        # Write multiple rows at once
        writer.writerows(values)

def write_dict():
    values = [
        {'name': '张三', 'age': 18, 'height': 170},
        {'name': '李四', 'age': 19, 'height': 190}
    ]
    with open("temp1.csv", 'w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, students)
        # The header row is written by calling writeheader()
        writer.writeheader()
        writer.writerows(values)

if __name__ == '__main__':
    write_list()
    read_list()
    write_dict()
    read_dict()
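
The csv module works on any file-like object, so `io.StringIO` is handy for experimenting without touching the disk. Note that every field comes back as a string (a small sketch):

import csv
import io

# In-memory "file" with the same shape as temp1.csv
data = io.StringIO("name,age,height\n张三,18,170\n")
for row in csv.DictReader(data):
    print(row)  # {'name': '张三', 'age': '18', 'height': '170'}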

3: Database operations with MySQL

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pymysql

db = pymysql.connect(host='192.168.2.4', user='root', passwd='root',
                     db='test', port=3306)

def insert_db(cursor):
    # When interpolating variables, always use %s placeholders
    sql = """
    insert into students(class_id,name,gender,score) values(%s,%s,%s,%s)
    """
    class_id = '4'; name = '小龙'; gender = 'M'; score = '99'
    # Execute the sql statement
    cursor.execute(sql, (class_id, name, gender, score))

def find_db(cursor):
    sql = """select * from students"""
    cursor.execute(sql)
    # print(cursor.fetchall()[-1])
    result = cursor.fetchall()
    for temp in result:
        print(temp)

def update_db(cursor):
    sql = """update students set score='100' where name='小龙'"""
    cursor.execute(sql)


def delete_db(cursor):
    sql = """delete from students where name='小龙'"""
    cursor.execute(sql)

if __name__ == '__main__':
    cursor = db.cursor()
    insert_db(cursor)
    update_db(cursor)
    delete_db(cursor)
    find_db(cursor)
    db.commit()
    db.close()
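
update_db and delete_db above hard-code their values into the SQL string; the same %s placeholder style used in insert_db also works for them, and it is the safe choice whenever a value comes from user input. A sketch (update_score is a hypothetical helper):

def update_score(cursor, name, score):
    # Parameterized query: pymysql escapes the values itself,
    # which prevents SQL injection
    sql = "update students set score=%s where name=%s"
    cursor.execute(sql, (score, name))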

4: Multithreading

4.1 Thread subclasses

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import threading, time

class CodingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('Writing code in %s' % threading.current_thread())
            time.sleep(1)

class DrawingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('Drawing in %s' % threading.current_thread())
            time.sleep(1)

def main():
    t1 = CodingThread()
    t2 = DrawingThread()
    t1.start()
    t2.start()

if __name__ == '__main__':
    main()
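
The same two workers can also be written without subclassing, by passing a callable to `threading.Thread(target=...)`; `join()` then blocks until each thread finishes (a sketch):

import threading
import time

def work(label):
    for x in range(3):
        print('%s in %s' % (label, threading.current_thread().name))
        time.sleep(1)

threads = [threading.Thread(target=work, args=(label,))
           for label in ('coding', 'drawing')]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for both workers to finish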

4.2 Acquiring and releasing locks

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import threading

VALUE = 0
gLock = threading.Lock()

def add_value():
    global VALUE
    # Acquire the lock
    gLock.acquire()
    for x in range(1000000):
        VALUE += 1
    # Release the lock
    gLock.release()
    print('value: %d' % VALUE)

def main():
    for x in range(2):
        # Note: do not add parentheses after the function name; with
        # parentheses you would pass the function's return value instead
        t = threading.Thread(target=add_value)
        t.start()

if __name__ == '__main__':
    main()
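
`Lock` is also a context manager, so the acquire()/release() pair can be replaced with a with-block that releases the lock even if the body raises (a sketch of the same function):

def add_value():
    global VALUE
    with gLock:  # released automatically, even on exceptions
        for x in range(1000000):
            VALUE += 1
        print('value: %d' % VALUE)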

4.3 Condition locks

import threading
import random
import time

gMoney = 1000
gCondition = threading.Condition()
gTimes = 0
gTotalTimes = 5

class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gCondition
        global gTimes
        while True:
            money = random.randint(100, 1000)
            gCondition.acquire()
            if gTimes >= gTotalTimes:
                gCondition.release()
                print('Producers have produced %s times in total' % gTimes)
                break
            gMoney += money
            print('%s deposited %s yuan, balance %s yuan' % (threading.current_thread(), money, gMoney))
            gTimes += 1
            time.sleep(0.5)
            gCondition.notify_all()
            gCondition.release()

class Consumer(threading.Thread):
    def run(self):
        global gMoney
        global gCondition
        while True:
            money = random.randint(100, 500)
            gCondition.acquire()
            # This must be a while loop: by the time this thread is
            # scheduled again, the condition may no longer hold
            while gMoney < money:
                if gTimes >= gTotalTimes:
                    gCondition.release()
                    return
                print('%s wants to withdraw %s yuan, balance %s yuan: insufficient!' % (threading.current_thread(), money, gMoney))
                gCondition.wait()
            gMoney -= money
            print('%s withdrew %s yuan, balance %s yuan' % (threading.current_thread(), money, gMoney))
            time.sleep(0.5)
            gCondition.release()

def main():
    for x in range(5):
        Consumer(name='Consumer-%d' % x).start()

    for x in range(2):
        Producer(name='Producer-%d' % x).start()

if __name__ == '__main__':
    main()
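
`Condition` supports the same with-statement as `Lock`, which removes the easy-to-miss release() calls on the early-exit paths above (a sketch of the consumer's critical section):

def consume_once(amount):
    global gMoney
    with gCondition:  # acquired here, released on every exit path
        while gMoney < amount:
            if gTimes >= gTotalTimes:
                return
            gCondition.wait()
        gMoney -= amount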

4.4 Doutula meme downloader

import threading
import requests
from lxml import etree
from urllib import request
import os
import re
from queue import Queue

# Collect the url of every meme image
class Producer(threading.Thread):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        response = requests.get(url, headers=self.headers)
        text = response.text
        html = etree.HTML(text)
        # Select the images whose class attribute is not 'gif'
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get('data-original')
            # img_url = img.xpath(".//@data-original")[0]
            # Extract the file extension from the image url
            suffix = os.path.splitext(img_url)[1]
            # suffix = '.' + img_url.split('.')[-1]
            alt = img.get('alt')
            # alt = img.xpath(".//@alt")[0]
            # Strip characters such as ,。??,/\\· that are not valid in file names
            alt = re.sub(r'[,。??,/\\·*!《]', '', alt)
            img_name = alt + suffix
            # Put the image url and name into the queue
            self.img_queue.put((img_url, img_name))

# Download the memes
class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                return
            url, filename = self.img_queue.get(block=True)
            # Download the image into the images directory
            request.urlretrieve(url, 'images/' + filename)
            print(filename + ' downloaded!')

def main():
    page_queue = Queue(100)
    img_queue = Queue(500)
    for x in range(1, 10):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        # Add the url to the queue
        page_queue.put(url)

    # Create the images directory if it does not exist yet
    if os.path.exists('images/') is False:
        os.makedirs('images/')

    for x in range(5):
        t = Producer(page_queue, img_queue)
        t.start()

    for x in range(5):
        t = Consumer(page_queue, img_queue)
        t.start()

if __name__ == '__main__':
    main()
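
The `empty()`-then-`get()` check in `Consumer.run` is racy: both queues can look empty while a producer is still parsing a page, and a consumer can block forever on the final `get()`. A common fix is a sentinel value: when all pages are done, put one sentinel per consumer into img_queue, and let each consumer stop when it sees one (a sketch, not the original code):

SENTINEL = (None, None)

# Consumer side: stop on the sentinel instead of polling empty()
def consume(img_queue):
    while True:
        url, filename = img_queue.get()
        if url is None:
            break  # sentinel: no more work
        request.urlretrieve(url, 'images/' + filename)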

5: selenium

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import time
import random
from selenium import webdriver
import xlwt
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

frontCar = '1234'
countNum = 0
# Replace these with the real staff names
class_one = ['小马']
class_two = ['大飞', '小猪']
class_three = ['李四', '王五']
class_four = ["张三"]

# Login module
def DengLu(driver):
    # Open the page
    print('About to open the login page')

    # Platform login page; replace with the inspection url
    driver.get("your inspection url")

    # Enter the account; replace with yours
    driver.find_element_by_xpath('/html/body/ngx-app/rt-login/div[2]/div[2]/input').send_keys(
        "your account")

    # Enter the password
    driver.find_element_by_xpath('/html/body/ngx-app/rt-login/div[2]/div[3]/input').send_keys("your password")

    # Enter the captcha manually and log in
    print("Please type the image captcha by hand and log in...")
    print("After logging in, open the first vehicle to be inspected...")

    # Ask how many records to pull
    while True:
        try:
            # If the input is an integer, start pulling data
            n = int(input('Enter the number of records to pull: '))
            if n > 5000:
                print("That number is too large, please try again:")
                continue
            break
        except ValueError:
            print('That is not an integer, please try again:')

    return n


# Data-pulling module
def StartGetDate(driver, fp, sheet, i, n):
    # Global variables frontCar and countNum
    global frontCar, countNum

    # Inspection timestamp
    ChaYanFullTime = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[7]/td[5]/span').text

    # License plate
    chepai = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[5]/td[3]/span').text

    localCar = ChaYanFullTime + chepai

    # If the current record equals the previous one, the click on
    # "previous vehicle" did not take effect, so click again
    while frontCar == localCar:
        try:
            # Click "previous vehicle" to display it
            driver.find_element_by_xpath(
                '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/div[2]/img[1]').click()
            # Wait for the license plate to load
            element = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH,
                    '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[5]/td[3]/span'))
            )
            # Wait for the timestamp to load
            element = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH,
                    '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[7]/td[5]/span'))
            )
            # Wait for the previous-vehicle view to fill in
            element = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH,
                    '//*[@id="app-root"]/app-main-layout'))
            )
            time.sleep(1)
        except Exception:
            print("All records have been pulled.")
            # driver.quit()
        finally:
            time.sleep(random.randint(1, 2))

        # Inspection timestamp
        ChaYanFullTime = driver.find_element_by_xpath(
            '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[7]/td[5]/span').text

        # License plate
        chepai = driver.find_element_by_xpath(
            '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[5]/td[3]/span').text

        localCar = ChaYanFullTime + chepai

    # Inspection date
    ChaYanDate = ChaYanFullTime.split(' ')[0]

    # Inspection time of day
    ChaYanTime = ChaYanFullTime.split(' ')[-1]

    # Vehicle type
    cheliangleixing = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[5]/td[5]/span').text

    # Amount
    jinE = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[8]/td[3]/span').text

    # Toll collector
    shouFeiYuan = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[11]/td[7]/span').text

    # Inspector
    jiChaYuan = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[12]/td[3]/span').text

    # Shift
    if jiChaYuan in class_three:
        banci = 3
    elif jiChaYuan in class_one:
        banci = 1
    elif jiChaYuan in class_two:
        banci = 2
    elif jiChaYuan in class_four:
        banci = 4
    else:
        banci = None

    # Inspection result
    chayanResult = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[9]/td[3]/span').text

    # Print the record
    print(i + 1, "/", n, ': ', banci, ChaYanFullTime, shouFeiYuan, jiChaYuan, chepai, cheliangleixing, jinE, end=' ')
    print(ChaYanFullTime, banci, shouFeiYuan, jiChaYuan, chepai, cheliangleixing, jinE, end=' ', file=fp)

    # Write the fields into the sheet
    sheet.write(i + 2, 0, i + 1)
    sheet.write(i + 2, 1, ChaYanDate)
    sheet.write(i + 2, 2, ChaYanTime)
    sheet.write(i + 2, 3, banci)
    # sheet.write(i + 2, 4, daoKou)
    sheet.write(i + 2, 5, shouFeiYuan)
    sheet.write(i + 2, 6, jiChaYuan)
    sheet.write(i + 2, 7, chepai)
    sheet.write(i + 2, 8, cheliangleixing)

    # For a failed inspection, also record the reason
    if chayanResult == '不合格':
        # Failure reason
        buhege = driver.find_element_by_xpath(
            '/html/body/ngx-app/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[10]/td[3]/span').text
        print(buhege, end=' ')
        print(buhege, end=' ', file=fp)
        sheet.write(i + 2, 9, buhege)
    else:
        print(end=' ')
        print(end=' ', file=fp)
        sheet.write(i + 2, 9, '')

    sheet.write(i + 2, 10, jinE)

    # Newline
    print(end='\n')
    print(end='\n', file=fp)

    # The current record becomes the previous one
    frontCar = localCar

    # Count the pulled records
    countNum = countNum + 1


def main():

    # Instantiate a launch-options object to configure chrome
    options = Options()

    # Set the browser window size
    options.add_argument('--window-size=1366,768')

    # Launch the browser
    print('Launch browser')
    # Get the driver object
    driver = webdriver.Chrome(chrome_options=options)

    # Delete cookies
    print('Delete cookies')
    driver.delete_all_cookies()

    # Log in
    n = DengLu(driver)

    # Record the start time
    startTime = time.time()

    # Plain-text output file
    fp = open(r"temp.txt", "w+", encoding="utf-8")
    # Excel output
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sheet1")

    for i in range(n):
        StartGetDate(driver, fp, sheet, i, n)
        if i % 10 == 9:
            # Save once every 10 vehicles
            workbook.save('temp.xls')

    # Close the text file
    fp.close()
    # Save as temp.xls
    workbook.save('temp.xls')

    # The run succeeded; quit the browser
    driver.quit()

    # Record the end time
    endTime = time.time()
    print("%d vehicles in total" % countNum)
    print("Total time: %d seconds" % (endTime - startTime))

if __name__ == '__main__':
    main()
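
Note that the `find_element_by_xpath` helpers used above were removed in Selenium 4; the modern spelling is `driver.find_element(By.XPATH, ...)`. A small helper in that style would also trim the repeated wait-then-read pattern (a sketch; text_of is a hypothetical helper):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def text_of(driver, xpath, timeout=60):
    # Wait until the element is present, then return its text
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath)))
    return driver.find_element(By.XPATH, xpath).text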
