
Building a High-Concurrency Crawler with asyncio

April 12, 2019 • python

After all this time studying coroutines and asyncio, I finally put them to work on a real project: crawling more than 1,500 web pages in one go. Asynchronous coroutines really are powerful.

Here's the story. Last night an admin at PTHome (pt之家) asked me to write a script to tally the standings for their first seeding contest. I wrote it with ordinary synchronous code first, and the crawl took over two hours to finish. The contest hasn't ended yet, so waiting until it does would have meant a five- or six-hour crawl. This morning I rewrote it asynchronously on top of the requests-async and aiomysql libraries, and the speedup was hard to believe: roughly 5 minutes to fetch 1,000+ pages and write them all into the database.
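
The conversion itself comes down to one pattern: wrap each page fetch in a coroutine, cap concurrency with a semaphore, and run everything on a single event loop. A minimal sketch of that pattern (the same one spider.py below uses, with the parsing stripped out; the function names here are illustrative):

import asyncio
import requests_async as requests

async def fetch(url, sem):
    # the semaphore keeps at most 60 requests in flight at once
    async with sem:
        async with requests.Session() as session:
            response = await session.get(url)
            return response.text

async def main(urls):
    sem = asyncio.Semaphore(60)
    # schedule every fetch at once; the loop interleaves them as responses arrive
    return await asyncio.gather(*(fetch(u, sem) for u in urls))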

First, the result: pthome.lemea.co . Click an uploader to see every torrent they entered in the contest; click a torrent for its details. Now the crawler code:

Database schema

create table firstevent(
    id int unsigned primary key auto_increment not null,
    name varchar(200) default '',              -- torrent name
    title varchar(200) default '',             -- torrent subtitle/title
    url varchar(100) default '',               -- details page URL
    volume varchar(10) default '',             -- size, e.g. '1.5GB'
    medium varchar(20) default '',             -- medium (Blu-ray, WEB-DL, ...)
    resolution varchar(20) default '',
    done_num_str smallint unsigned default 0,  -- completion (snatch) count
    magic_value int unsigned default 0,        -- magic-value points awarded
    thank_num smallint unsigned default 0,     -- number of thank-yous
    user_name varchar(100) default '',         -- uploader name
    user_url varchar(200) default '',          -- uploader profile URL
    production_team varchar(20) default '',    -- release group
    score decimal(5,2) default 0.0             -- computed contest score
);
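
The score column holds the contest score that spider.py computes further down. Reading the arithmetic back out of that code (this is a reconstruction from the code below, not an official rule sheet), the formula is:

def contest_score(size_gb, done, thanks, magic, medium_score, team_bonus):
    # mirrors the arithmetic in parser_html() below
    return round(size_gb / 10 + done / 10 + thanks / 50 + magic / 1000
                 + medium_score + team_bonus, 2)

where medium_score comes from a lookup (Blu-ray original disc 7, DVD original disc 6, REMUX 5, Recode/CD 3, HDTV/WEB-DL 2, anything else 0) and team_bonus is 6 for PTHome releases, 0 otherwise.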

spider.py

Cookies and database credentials are redacted.

import requests_async as requests
import asyncio
import aiomysql
from bs4 import BeautifulSoup
import re
torrents_num=36490
cookies={
    '__cfduid':'************************************',
    'UM_distinctid':'************************************',
    'CNZZDATA1275677506':'************************************',
    'c_secure_ssl':'************************************',
    'c_secure_uid':'************************************',
    'c_secure_pass':'************************************',
    'c_secure_tracker_ssl':'************************************',
    'c_secure_login':'************************************'
}
headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
async def parser_html(url,loop,sem):
    async with sem:
        async with requests.Session() as session:
            response=await session.get(url=url,headers=headers,cookies=cookies)
            # parse the page
            html = BeautifulSoup(response.text, 'html.parser')
            td_list = html.select('.rowfollow')
            if len(td_list)!=0:
                # subtitle row; replace quotes with spaces
                title=td_list[1].get_text()
                title = (re.compile('"')).sub(' ', title)
                title = (re.compile("'")).sub(' ', title)
                print(title)
                # only torrents whose title mentions the contest (第一届发种大赛, "First Seeding Contest")
                if re.match(r'.*第一届发种大赛.*',title):
                    # torrent name
                    name=(td_list[0].select('a'))[0].get_text()
                    print(name)
                    # details-page URL
                    print(url)
                    # info cell: its text runs '大小:<size>类型:...' (size/type/medium/...)
                    volume=td_list[2].get_text()
                    print(volume)
                    volume_str=re.match(r'大小:(.*?)类型',volume)
                    if volume_str:
                        volume_num=(volume_str.group(1)).replace(' ','')
                        volume_num=volume_num.strip()
                    else:
                        volume_num=''
                    print(volume_num)
                    # size score: normalize to GB, then 1 point per 10 GB
                    if re.match(r'(.*?)GB',volume_num):
                        volume_score = (re.match(r'(.*?)GB', volume_num)).group(1)
                        volume_score=float(volume_score)/10
                    elif re.match(r'(.*?)MB',volume_num):
                        volume_score=(re.match(r'(.*?)MB', volume_num)).group(1)
                        volume_score=float(volume_score)/10240
                    elif re.match(r'(.*?)TB',volume_num):
                        volume_score = (re.match(r'(.*?)TB', volume_num)).group(1)
                        volume_score = float(volume_score)*1024
                        volume_score=volume_score/10
                    else:
                        volume_score=0

                    # medium: the text between '媒介: ' and the next field
                    if re.match(r'.*媒介: (.*?)编',volume):
                        medium=(re.match(r'.*媒介: (.*?)编',volume)).group(1)
                        medium = medium.replace(' ', '')
                        medium=medium.strip()
                    else:
                        medium=''
                    print(medium)
                    # resolution
                    if re.match(r'.*分辨率:(.{3,6})',volume):
                        resolution=(re.match(r'.*分辨率:(.{3,6})',volume)).group(1)
                        resolution=resolution.replace(' ','')
                        resolution=resolution.strip()
                    else:
                        resolution=''
                    print(resolution)
                    # completion (snatch) count and magic-value award; default to 0 so a
                    # page that lacks these cells can't raise a NameError further down
                    done_num_str = 0
                    magic_value = 0
                    thank_num_list=[]
                    for item in td_list:
                        if re.match(r'.*完成:(.*)次', item.get_text()):
                            done_num_str = (re.match(r'.*完成:(.*)次', item.get_text())).group(1)
                        if re.match(r'.*得(.*)个魔力值奖', item.get_text()):
                            magic_value = (re.match(r'.*得(.*)个魔力值奖', item.get_text())).group(1)
                        thank_num_list.append(len(item.select('.User_Name')))
                    # thank-you score: 1 point per 50 thank-yous
                    thank_num=float(thank_num_list[-1])
                    thank_score=thank_num/50
                    # magic-value score: 1 point per 1000 magic points
                    magic_value_score=float(magic_value)/1000

                    # completion score: 1 point per 10 completions
                    done_score=float(done_num_str)/10
                    # uploader (last link in the name cell)
                    user_url='https://www.pthome.net/%s' % ((td_list[0].select('a'))[-1])['href']
                    user_name=((td_list[0].select('a'))[-1]).get_text()
                    print(user_url,user_name)
                    # release group
                    if re.match(r'.*制作组:(.{3,7})',volume):
                        production_team=(re.match(r'.*制作组:(.{3,7})',volume)).group(1)
                        production_team = production_team.replace(' ', '')
                        production_team = production_team.strip()
                    else:
                        production_team=''
                    print(production_team)
                    # quality score
                    # medium score lookup
                    if re.match(r'.*(Blu-ray(原盘)).*',medium):
                        medium_score=7
                    elif re.match(r'.*(DVD(原盘)).*',medium):
                        medium_score=6
                    elif re.match(r'.*(REMUX).*',medium):
                        medium_score=5
                    elif re.match(r'.*(HDTV).*',medium):
                        medium_score=2
                    elif re.match(r'.*(WEB-DL).*',medium):
                        medium_score=2
                    elif re.match(r'.*(Recode).*',medium):
                        medium_score=3
                    elif re.match(r'.*(CD).*',medium):
                        medium_score=3
                    else:
                        medium_score=0
                    # release-group bonus: 6 points for PTHome releases
                    if re.match(r'.*(PTHome).*',production_team):
                        production_team_score=6
                    else:
                        production_team_score=0
                    score=float(volume_score)+float(done_score)+float(thank_score)+float(magic_value_score)+float(medium_score)+float(production_team_score)
                    score=round(score,2)
                    print('score:',score)

                    async with aiomysql.create_pool(host='*********', port=3306, db='******', user='******', password='******',loop=loop) as pool:
                        async with pool.acquire() as conn:
                            async with conn.cursor() as cs:
                                # parameterized queries: the driver escapes the values,
                                # so quotes in scraped fields cannot break the SQL
                                count = await cs.execute("select * from firstevent where url=%s;", (url,))
                                if count == 0:
                                    # id=0 lets auto_increment assign the real key
                                    await cs.execute(
                                        "insert into firstevent values(0,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);",
                                        (name, title, url, volume_num, medium, resolution,
                                         int(done_num_str), int(magic_value), int(thank_num),
                                         user_name, user_url, production_team, score))
                                await conn.commit()
                        # the async with block closes the pool on exit
                        print('database write done')


if __name__ == '__main__':
    tasks=[]
    loop = asyncio.get_event_loop()
    # cap concurrency at 60 in-flight requests
    sem = asyncio.Semaphore(60)
    for i in range(35190, torrents_num):
        url = 'https://www.pthome.net/details.php?id=%d&hit=1' % i
        print(url)
        tasks.append(asyncio.ensure_future(parser_html(url,loop,sem)))
    loop.run_until_complete(asyncio.wait(tasks))
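
One thing I'd change in hindsight: parser_html() builds a brand-new aiomysql pool for every single page, so each torrent pays for a fresh round of MySQL connections. A sketch of the alternative, creating one pool up front and handing it to every task (this assumes parser_html is changed to take the pool instead of the loop, and reuses spider.py's globals; the structure is illustrative):

import asyncio
import aiomysql

async def main(loop):
    # one shared connection pool for the whole crawl
    pool = await aiomysql.create_pool(host='*********', port=3306, db='******',
                                      user='******', password='******', loop=loop)
    sem = asyncio.Semaphore(60)
    urls = ('https://www.pthome.net/details.php?id=%d&hit=1' % i
            for i in range(35190, torrents_num))
    await asyncio.gather(*(parser_html(url, pool, sem) for url in urls))
    pool.close()
    await pool.wait_closed()

loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))

Inside parser_html, the whole create_pool block then collapses to a single async with pool.acquire() as conn: around the cursor work.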

Then I took the mini-web framework I'd written earlier, hacked it up, and built a page to visualize the data. Sharing the code here: https://share.weiyun.com/5r586JZ

The scraped data: pthome.lemea.co

Tags: python
Last edited: April 17, 2019