Web Scraping in Practice

Last updated: May 31, 2020, early morning

1: requests

1.1 Douban now-playing movies with XPath

#!/usr/bin/python3
# -*- coding: utf-8 -*-
from lxml import etree
import requests

## 1. Fetch the source of the target page
headers = {
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    'Referer': "https://cn.bing.com/"
}

url = 'https://movie.douban.com/cinema/nowplaying/beijing/'
res = requests.get(url, headers=headers)

# Save the page source as text
text = res.text

# response.text returns Unicode (str) data
# response.content returns the raw byte stream (bytes)


## 2. Extract data according to a set of rules

# Parse an HTML document from a string constant.
html = etree.HTML(text)

# Now playing is index [0], coming soon is [1]; locate class='lists' with an XPath expression
ul = html.xpath("//ul[@class='lists']")[0]

# print(etree.tostring(ul, encoding='utf-8').decode('utf-8'))

# Locate the li tags with an XPath expression and store them in the list lis
lis = ul.xpath("./li")

# Build the list of movie dicts
movies = []

for li in lis:
    # print(etree.tostring(li, encoding='utf-8').decode('utf-8'))
    title = li.xpath("@data-title")
    score = li.xpath("@data-score")
    duration = li.xpath("@data-duration")
    region = li.xpath("@data-region")
    director = li.xpath("@data-director")
    actors = li.xpath("@data-actors")
    thumbnail = li.xpath(".//img/@src")
    movie = {
        'title': title,
        'score': score,
        'duration': duration,
        'region': region,
        'director': director,
        'actors': actors,
        'thumbnail': thumbnail
    }
    movies.append(movie)

for movie in movies:
    print(movie)
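
One caveat with the loop above: every `li.xpath("@data-title")` call returns a list, so each value stored in the dict is a one-element list. A minimal sketch of the same loop using lxml's `Element.get`, which returns the attribute as a plain string (or None if it is missing):

# Sketch: Element.get() returns the attribute value as a plain string
# instead of a one-element list.
for li in lis:
    movie = {
        'title': li.get('data-title'),
        'score': li.get('data-score'),
        'director': li.get('data-director'),
    }
    print(movie)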

1.2 Movie Heaven (dytt8) with XPath

#!/usr/bin/python3
# -*- coding: utf-8 -*-
from lxml import etree
import requests

BASE_DOMAIN = 'https://dytt8.net'

HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    'Referer': "http://whois.chinaz.com/dytt8.net"
}

# Get the detail-page url of every movie on a list page
def get_detail_urls(url):
    res = requests.get(url, headers=HEADERS)
    # requests guesses the encoding when decoding, and sometimes guesses
    # wrong, so decode the raw bytes manually
    text = res.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)  # <Element html at 0x168712c4540>
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")

    # The commented-out code below is equivalent to the map() call:
    # def abc(url):
    #     return BASE_DOMAIN + url
    # index = 0
    # for detail_url in detail_urls:
    #     detail_url = abc(detail_url)
    #     detail_urls[index] = detail_url
    #     index += 1

    # map() applies the given function to every item of the sequence.
    # The scraped hrefs are relative, so prepend BASE_DOMAIN to get full urls
    detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)

    return detail_urls

# Parse a detail page
def parse_detail_page(url):
    movie = {}
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # Title: xpath returns a list, so take element [0]
    # title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    title = html.xpath("//h1/font/text()")[0]
    movie['title'] = title
    # Poster
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    cover = imgs[0]
    movie['cover'] = cover
    # Details
    def parse_info(info, rule):
        return info.replace(rule, "").strip()
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            # Strip the '◎年  代' label; strip() trims the surrounding whitespace
            info = parse_info(info, "◎年  代")
            movie['year'] = info
        elif info.startswith("◎产  地"):
            info = parse_info(info, "◎产  地")
            movie['country'] = info
        elif info.startswith("◎类  别"):
            info = parse_info(info, "◎类  别")
            movie['category'] = info
        elif info.startswith("◎豆瓣评分"):
            info = parse_info(info, "◎豆瓣评分")
            movie['douban_rating'] = info
        elif info.startswith("◎片  长"):
            info = parse_info(info, "◎片  长")
            movie['duration'] = info
        elif info.startswith("◎导  演"):
            info = parse_info(info, "◎导  演")
            movie['director'] = info
        elif info.startswith("◎主  演"):
            info = parse_info(info, "◎主  演")
            actors = [info]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简  介"):
            info = infos[index + 1].strip()
            movie["profile"] = info
    # Download link
    downloadUrl = html.xpath("//td[@bgcolor='#fdfddf']/a/text()")[0]
    movie["downloadUrl"] = downloadUrl
    return movie

# Crawl
def spider():
    base_url = 'https://dytt8.net/html/gndy/dyzz/list_23_{}.html'
    # Crawl pages 1 to n
    movies = []
    for x in range(1, 2):
        # format() substitutes x into the {} placeholder
        url = base_url.format(x)
        detail_urls = get_detail_urls(url)
        # Walk through every detail page
        for detail_url in detail_urls:
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)

if __name__ == "__main__":
    spider()
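
Note that `map()` returns a one-shot iterator in Python 3, so `detail_urls` can only be walked once. A list comprehension is an equally idiomatic alternative that is eager and reusable (a sketch of the same step in `get_detail_urls`):

# Same transformation as the map() call, but the result is a list
# that can be indexed and iterated more than once.
detail_urls = [BASE_DOMAIN + u for u in detail_urls]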

1.3 Gushiwen with regular expressions (re)

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import re

def parse_page(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    response = requests.get(url, headers=headers)
    text = response.text
    # Poem titles
    # . does not match \n, so add re.DOTALL
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Dynasties; re.S == re.DOTALL
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.S)
    # Authors
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.S)
    # Poem bodies
    content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.S)
    contents = []
    for content in content_tags:
        # Remove tags and newlines; strip() trims surrounding whitespace
        content = re.sub(r'<.*?>|\n', "", content).strip()
        contents.append(content)
    # Assemble the records
    poems = []
    for value in zip(titles, dynasties, authors, contents):
        title, dynasty, author, content = value
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        poems.append(poem)
    for poem in poems:
        print(poem)

def main():
    for i in range(1, 4):
        url = 'https://www.gushiwen.org/default_%s.aspx' % i
        parse_page(url)

if __name__ == '__main__':
    main()
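
A quick self-contained illustration of why `re.DOTALL` (alias `re.S`) is needed here: without it, `.` stops at newlines, so a pattern cannot cross the line breaks in the HTML.

import re

# Hypothetical two-line snippet in the same shape as the scraped HTML
html = '<div class="cont">\n<b>静夜思</b>\n</div>'
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', html))        # [] - '.' cannot cross \n
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', html, re.S))  # ['静夜思']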

2: File storage with JSON and CSV

2.1 json

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json

persons = [
    {
        'name': '张三',
        'age': 20,
        'city': 'beijing'
    },
    {
        'name': '李四',
        'age': 20,
        'city': 'shanghai'
    }
]
# Use dumps to print, dump to write a json file; with Chinese text,
# pass ensure_ascii=False
print(json.dumps(persons, ensure_ascii=False))
with open('persons.json', 'w', encoding='utf-8') as fp:
    json.dump(persons, fp, ensure_ascii=False)

# Read the json file back
with open('persons.json', 'r', encoding='utf-8') as fp:
    persons = json.load(fp)
    print(persons)
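
To see what `ensure_ascii` changes: with the default `ensure_ascii=True`, every non-ASCII character is written as a `\uXXXX` escape, which is still valid JSON but unreadable for Chinese text.

import json

print(json.dumps({'name': '张三'}))                      # {"name": "\u5f20\u4e09"}
print(json.dumps({'name': '张三'}, ensure_ascii=False))  # {"name": "张三"}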

2.2 csv

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import csv

def read_list():
    with open('temp.csv', 'r', encoding='utf-8', newline='') as fp:
        # reader is an iterator
        reader = csv.reader(fp)
        # Skip the header row
        next(reader)
        for x in reader:
            age = x[1]
            height = x[-1]
            print({'age': age, 'height': height})

def read_dict():
    with open('temp1.csv', 'r', encoding='utf-8', newline='') as fp:
        # DictReader does not yield the header row as data
        reader = csv.DictReader(fp)
        for x in reader:
            print(x)

students = ['name', 'age', 'height']

def write_list():
    values = [
        ('张三', 18, 170),
        ('李四', 20, 180)
    ]
    # Pass newline='' so the csv module controls row endings itself;
    # otherwise a blank line appears between rows on Windows
    with open("temp.csv", 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(students)
        # Write multiple rows at once
        writer.writerows(values)

def write_dict():
    values = [
        {'name': '张三', 'age': 18, 'height': 170},
        {'name': '李四', 'age': 19, 'height': 190}
    ]
    with open("temp1.csv", 'w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, students)
        # The header row is written by calling writeheader()
        writer.writeheader()
        writer.writerows(values)

if __name__ == '__main__':
    write_list()
    read_list()
    write_dict()
    read_dict()
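
The csv module works on any file-like object, so `io.StringIO` is handy for experimenting without touching the disk. Note that every field comes back as a string (a small sketch):

import csv
import io

# In-memory "file" with the same shape as temp1.csv
data = io.StringIO("name,age,height\n张三,18,170\n")
for row in csv.DictReader(data):
    print(row)  # {'name': '张三', 'age': '18', 'height': '170'}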

3: Database operations with MySQL

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pymysql

db = pymysql.connect(host='192.168.2.4', user='root', passwd='root',
                     db='test', port=3306)

def insert_db(cursor):
    # When interpolating variables, always use %s placeholders
    sql = """
    insert into students(class_id,name,gender,score) values(%s,%s,%s,%s)
    """
    class_id = '4'; name = '小龙'; gender = 'M'; score = '99'
    # Execute the sql statement
    cursor.execute(sql, (class_id, name, gender, score))

def find_db(cursor):
    sql = """select * from students"""
    cursor.execute(sql)
    # print(cursor.fetchall()[-1])
    result = cursor.fetchall()
    for temp in result:
        print(temp)

def update_db(cursor):
    sql = """update students set score='100' where name='小龙'"""
    cursor.execute(sql)


def delete_db(cursor):
    sql = """delete from students where name='小龙'"""
    cursor.execute(sql)

if __name__ == '__main__':
    cursor = db.cursor()
    insert_db(cursor)
    update_db(cursor)
    delete_db(cursor)
    find_db(cursor)
    db.commit()
    db.close()
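
update_db and delete_db above hard-code their values into the SQL string; the same %s placeholder style used in insert_db also works for them, and it is the safe choice whenever a value comes from user input. A sketch (update_score is a hypothetical helper):

def update_score(cursor, name, score):
    # Parameterized query: pymysql escapes the values itself,
    # which prevents SQL injection
    sql = "update students set score=%s where name=%s"
    cursor.execute(sql, (score, name))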

4: Multithreading

4.1 Thread subclasses

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import threading, time

class CodingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('Writing code in %s' % threading.current_thread())
            time.sleep(1)

class DrawingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('Drawing in %s' % threading.current_thread())
            time.sleep(1)

def main():
    t1 = CodingThread()
    t2 = DrawingThread()
    t1.start()
    t2.start()

if __name__ == '__main__':
    main()
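
The same two workers can also be written without subclassing, by passing a callable to `threading.Thread(target=...)`; `join()` then blocks until each thread finishes (a sketch):

import threading
import time

def work(label):
    for x in range(3):
        print('%s in %s' % (label, threading.current_thread().name))
        time.sleep(1)

threads = [threading.Thread(target=work, args=(label,))
           for label in ('coding', 'drawing')]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for both workers to finish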

4.2 Acquiring and releasing locks

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import threading

VALUE = 0
gLock = threading.Lock()

def add_value():
    global VALUE
    # Acquire the lock
    gLock.acquire()
    for x in range(1000000):
        VALUE += 1
    # Release the lock
    gLock.release()
    print('value: %d' % VALUE)

def main():
    for x in range(2):
        # Note: do not add parentheses after the function name; with
        # parentheses you would pass the function's return value instead
        t = threading.Thread(target=add_value)
        t.start()

if __name__ == '__main__':
    main()
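
`Lock` is also a context manager, so the acquire()/release() pair can be replaced with a with-block that releases the lock even if the body raises (a sketch of the same function):

def add_value():
    global VALUE
    with gLock:  # released automatically, even on exceptions
        for x in range(1000000):
            VALUE += 1
        print('value: %d' % VALUE)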

4.3 Condition locks

import threading
import random
import time

gMoney = 1000
gCondition = threading.Condition()
gTimes = 0
gTotalTimes = 5

class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gCondition
        global gTimes
        while True:
            money = random.randint(100, 1000)
            gCondition.acquire()
            if gTimes >= gTotalTimes:
                gCondition.release()
                print('Producers have produced %s times in total' % gTimes)
                break
            gMoney += money
            print('%s deposited %s yuan, balance %s yuan' % (threading.current_thread(), money, gMoney))
            gTimes += 1
            time.sleep(0.5)
            gCondition.notify_all()
            gCondition.release()

class Consumer(threading.Thread):
    def run(self):
        global gMoney
        global gCondition
        while True:
            money = random.randint(100, 500)
            gCondition.acquire()
            # This must be a while loop: by the time this thread is
            # scheduled again, the condition may no longer hold
            while gMoney < money:
                if gTimes >= gTotalTimes:
                    gCondition.release()
                    return
                print('%s wants to withdraw %s yuan, balance %s yuan: insufficient!' % (threading.current_thread(), money, gMoney))
                gCondition.wait()
            gMoney -= money
            print('%s withdrew %s yuan, balance %s yuan' % (threading.current_thread(), money, gMoney))
            time.sleep(0.5)
            gCondition.release()

def main():
    for x in range(5):
        Consumer(name='Consumer-%d' % x).start()

    for x in range(2):
        Producer(name='Producer-%d' % x).start()

if __name__ == '__main__':
    main()
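
`Condition` supports the same with-statement as `Lock`, which removes the easy-to-miss release() calls on the early-exit paths above (a sketch of the consumer's critical section):

def consume_once(amount):
    global gMoney
    with gCondition:  # acquired here, released on every exit path
        while gMoney < amount:
            if gTimes >= gTotalTimes:
                return
            gCondition.wait()
        gMoney -= amount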

4.4 Doutula meme downloader

import threading
import requests
from lxml import etree
from urllib import request
import os
import re
from queue import Queue

# Collect the url of every meme image
class Producer(threading.Thread):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        response = requests.get(url, headers=self.headers)
        text = response.text
        html = etree.HTML(text)
        # Select the images whose class attribute is not 'gif'
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get('data-original')
            # img_url = img.xpath(".//@data-original")[0]
            # Extract the file extension from the image url
            suffix = os.path.splitext(img_url)[1]
            # suffix = '.' + img_url.split('.')[-1]
            alt = img.get('alt')
            # alt = img.xpath(".//@alt")[0]
            # Strip characters such as ,。??,/\\· that are not valid in file names
            alt = re.sub(r'[,。??,/\\·*!《]', '', alt)
            img_name = alt + suffix
            # Put the image url and name into the queue
            self.img_queue.put((img_url, img_name))

# Download the memes
class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                return
            url, filename = self.img_queue.get(block=True)
            # Download the image into the images directory
            request.urlretrieve(url, 'images/' + filename)
            print(filename + ' downloaded!')

def main():
    page_queue = Queue(100)
    img_queue = Queue(500)
    for x in range(1, 10):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        # Add the url to the queue
        page_queue.put(url)

    # Create the images directory if it does not exist yet
    if os.path.exists('images/') is False:
        os.makedirs('images/')

    for x in range(5):
        t = Producer(page_queue, img_queue)
        t.start()

    for x in range(5):
        t = Consumer(page_queue, img_queue)
        t.start()

if __name__ == '__main__':
    main()
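
The `empty()`-then-`get()` check in `Consumer.run` is racy: both queues can look empty while a producer is still parsing a page, and a consumer can block forever on the final `get()`. A common fix is a sentinel value: when all pages are done, put one sentinel per consumer into img_queue, and let each consumer stop when it sees one (a sketch, not the original code):

SENTINEL = (None, None)

# Consumer side: stop on the sentinel instead of polling empty()
def consume(img_queue):
    while True:
        url, filename = img_queue.get()
        if url is None:
            break  # sentinel: no more work
        request.urlretrieve(url, 'images/' + filename)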

5: selenium

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import time
import random
from selenium import webdriver
import xlwt
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

frontCar = '1234'
countNum = 0
# Replace these with the real staff names
class_one = ['小马']
class_two = ['大飞', '小猪']
class_three = ['李四', '王五']
class_four = ["张三"]

# Login module
def DengLu(driver):
    # Open the page
    print('About to open the login page')

    # Platform login page; replace with the inspection url
    driver.get("your inspection url")

    # Enter the account; replace with yours
    driver.find_element_by_xpath('/html/body/ngx-app/rt-login/div[2]/div[2]/input').send_keys(
        "your account")

    # Enter the password
    driver.find_element_by_xpath('/html/body/ngx-app/rt-login/div[2]/div[3]/input').send_keys("your password")

    # Enter the captcha manually and log in
    print("Please type the image captcha by hand and log in...")
    print("After logging in, open the first vehicle to be inspected...")

    # Ask how many records to pull
    while True:
        try:
            # If the input is an integer, start pulling data
            n = int(input('Enter the number of records to pull: '))
            if n > 5000:
                print("That number is too large, please try again:")
                continue
            break
        except ValueError:
            print('That is not an integer, please try again:')

    return n


# Data-pulling module
def StartGetDate(driver, fp, sheet, i, n):
    # Global variables frontCar and countNum
    global frontCar, countNum

    # Inspection timestamp
    ChaYanFullTime = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[7]/td[5]/span').text

    # License plate
    chepai = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[5]/td[3]/span').text

    localCar = ChaYanFullTime + chepai

    # If the current record equals the previous one, the click on
    # "previous vehicle" did not take effect, so click again
    while frontCar == localCar:
        try:
            # Click "previous vehicle" to display it
            driver.find_element_by_xpath(
                '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/div[2]/img[1]').click()
            # Wait for the license plate to load
            element = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH,
                    '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[5]/td[3]/span'))
            )
            # Wait for the timestamp to load
            element = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH,
                    '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[7]/td[5]/span'))
            )
            # Wait for the previous-vehicle view to fill in
            element = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH,
                    '//*[@id="app-root"]/app-main-layout'))
            )
            time.sleep(1)
        except Exception:
            print("All records have been pulled.")
            # driver.quit()
        finally:
            time.sleep(random.randint(1, 2))

        # Inspection timestamp
        ChaYanFullTime = driver.find_element_by_xpath(
            '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[7]/td[5]/span').text

        # License plate
        chepai = driver.find_element_by_xpath(
            '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[5]/td[3]/span').text

        localCar = ChaYanFullTime + chepai

    # Inspection date
    ChaYanDate = ChaYanFullTime.split(' ')[0]

    # Inspection time of day
    ChaYanTime = ChaYanFullTime.split(' ')[-1]

    # Vehicle type
    cheliangleixing = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[5]/td[5]/span').text

    # Amount
    jinE = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[8]/td[3]/span').text

    # Toll collector
    shouFeiYuan = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[11]/td[7]/span').text

    # Inspector
    jiChaYuan = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[12]/td[3]/span').text

    # Shift
    if jiChaYuan in class_three:
        banci = 3
    elif jiChaYuan in class_one:
        banci = 1
    elif jiChaYuan in class_two:
        banci = 2
    elif jiChaYuan in class_four:
        banci = 4
    else:
        banci = None

    # Inspection result
    chayanResult = driver.find_element_by_xpath(
        '//*[@id="app-root"]/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[9]/td[3]/span').text

    # Print the record
    print(i + 1, "/", n, ': ', banci, ChaYanFullTime, shouFeiYuan, jiChaYuan, chepai, cheliangleixing, jinE, end=' ')
    print(ChaYanFullTime, banci, shouFeiYuan, jiChaYuan, chepai, cheliangleixing, jinE, end=' ', file=fp)

    # Write the fields into the sheet
    sheet.write(i + 2, 0, i + 1)
    sheet.write(i + 2, 1, ChaYanDate)
    sheet.write(i + 2, 2, ChaYanTime)
    sheet.write(i + 2, 3, banci)
    # sheet.write(i + 2, 4, daoKou)
    sheet.write(i + 2, 5, shouFeiYuan)
    sheet.write(i + 2, 6, jiChaYuan)
    sheet.write(i + 2, 7, chepai)
    sheet.write(i + 2, 8, cheliangleixing)

    # For a failed inspection, also record the reason
    if chayanResult == '不合格':
        # Failure reason
        buhege = driver.find_element_by_xpath(
            '/html/body/ngx-app/app-main-layout/div/div/div/app-check-manage/app-check-manage-detial/div/app-ireport/div/div[2]/table/tbody/tr[10]/td[3]/span').text
        print(buhege, end=' ')
        print(buhege, end=' ', file=fp)
        sheet.write(i + 2, 9, buhege)
    else:
        print(end=' ')
        print(end=' ', file=fp)
        sheet.write(i + 2, 9, '')

    sheet.write(i + 2, 10, jinE)

    # Newline
    print(end='\n')
    print(end='\n', file=fp)

    # The current record becomes the previous one
    frontCar = localCar

    # Count the pulled records
    countNum = countNum + 1


def main():

    # Instantiate a launch-options object to configure chrome
    options = Options()

    # Set the browser window size
    options.add_argument('--window-size=1366,768')

    # Launch the browser
    print('Launch browser')
    # Get the driver object
    driver = webdriver.Chrome(chrome_options=options)

    # Delete cookies
    print('Delete cookies')
    driver.delete_all_cookies()

    # Log in
    n = DengLu(driver)

    # Record the start time
    startTime = time.time()

    # Plain-text output file
    fp = open(r"temp.txt", "w+", encoding="utf-8")
    # Excel output
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sheet1")

    for i in range(n):
        StartGetDate(driver, fp, sheet, i, n)
        if i % 10 == 9:
            # Save once every 10 vehicles
            workbook.save('temp.xls')

    # Close the text file
    fp.close()
    # Save as temp.xls
    workbook.save('temp.xls')

    # The run succeeded; quit the browser
    driver.quit()

    # Record the end time
    endTime = time.time()
    print("%d vehicles in total" % countNum)
    print("Total time: %d seconds" % (endTime - startTime))

if __name__ == '__main__':
    main()
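
Note that the `find_element_by_xpath` helpers used above were removed in Selenium 4; the modern spelling is `driver.find_element(By.XPATH, ...)`. A small helper in that style would also trim the repeated wait-then-read pattern (a sketch; text_of is a hypothetical helper):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def text_of(driver, xpath, timeout=60):
    # Wait until the element is present, then return its text
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath)))
    return driver.find_element(By.XPATH, xpath).text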
