
nonebot Update Notification Plugin

• April 14, 2019 • python

The idea: whenever PT之家 (pthome.net) publishes a new torrent, the plugin detects it and posts the details to a QQ group. There is also a second script: I took a scraping job on Xianyu to pull real-time data from the Qingpu district station's air-quality page, writing each record into a database and into an Excel sheet at the same time.

The nonebot plugin:

import nonebot
import re
from bs4 import BeautifulSoup
import requests_async as requests
import asyncio
import time
import aioredis
#start_num=36583  # initial torrent id, seeded into redis once
group_num=816894914  # QQ group that receives the periodic heartbeat message
cookies={
    '__cfduid':'#########################',
    'UM_distinctid':'#########################',
    'CNZZDATA1275677506':'1#########################76',
    'c_secure_ssl':'e#########################D',
    'c_secure_uid':'N#########################D',
    'c_secure_pass':'4#########################9',
    'c_secure_tracker_ssl':'b#########################D',
    'c_secure_login':'b#########################D'
}
headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
async def go():
    # read the last-seen torrent id (the crawl watermark) from redis
    loop = asyncio.get_event_loop()
    conn = await aioredis.create_connection('redis://127.0.0.1', loop=loop)
    #await conn.execute('set', 'start_num', start_num)  # run once to seed the key
    val = await conn.execute('get', 'start_num')
    start_num = val.decode()  # value is an ASCII digit string
    conn.close()
    await conn.wait_closed()
    return start_num
async def parser_html(i,url):
    global start_num
    async with requests.Session() as session:
        response=await session.get(url=url,headers=headers,cookies=cookies)
        # parse the detail page
        html = BeautifulSoup(response.text, 'html.parser')
        td_list = html.select('.rowfollow')
        # an empty selection means this torrent id doesn't exist (yet)
        if len(td_list)!=0:
            # title
            title=td_list[1].get_text()
            # size: the cell text reads "大小:<size>类型...", grab the middle part
            volume = td_list[2].get_text()
            volume_str = re.match(r'大小:(.*?)类型', volume)
            if volume_str:
                volume_num=volume_str.group(1).replace(' ', '').strip()
            else:
                volume_num=''
            # group message (Chinese): "new torrent: <title> / size / link"
            msg='新种子:\n'+str(title)+'\n大小:'+str(volume_num)+'\n链接:'+str(url)
            bot = nonebot.get_bot()
            await bot.send_group_msg(group_id=984611554, message=msg)
            await bot.send_group_msg(group_id=188784835, message=msg)
            await bot.send_group_msg(group_id=816894914, message=msg)
            # move the watermark forward so the next round starts after this id
            loop = asyncio.get_event_loop()
            conn = await aioredis.create_connection('redis://127.0.0.1', loop=loop)
            val = await conn.execute('get', 'start_num')
            start_num=val.decode()
            if i>=int(start_num):
                await conn.execute('set', 'start_num', i)
                print('start_num update',i)
            conn.close()
            await conn.wait_closed()

@nonebot.scheduler.scheduled_job('interval', minutes=5)
async def _():
    start_num=await go()
    tasks = []
    # probe the next 24 ids above the stored watermark, concurrently
    for i in range(int(start_num) + 1, int(start_num) + 25):
        url = 'https://www.pthome.net/details.php?id=%d&hit=1' % i
        tasks.append(asyncio.ensure_future(parser_html(i, url)))
    await asyncio.gather(*tasks)
    print('finished one polling round')
    bot = nonebot.get_bot()
    # heartbeat (Chinese): "checking for new torrents; reply anything here to wake the bot"
    msg=str(time.asctime(time.localtime(time.time())))+'\t'+'检测是否有新种子发布'+'\t'+'停止检测请在这里随便回复点东西唤醒机器人'
    await bot.send_group_msg(group_id=group_num, message=msg)
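
A side note on the HTTP client: the requests_async package (requests-async) has since been archived in favor of httpx, which exposes a nearly identical async API. If the import above fails, here is a hedged sketch of the same fetch with httpx, reusing the headers and cookies dicts defined above:

import httpx

async def fetch(url):
    # httpx.AsyncClient plays the role of requests.Session in the plugin
    async with httpx.AsyncClient(headers=headers, cookies=cookies) as client:
        response = await client.get(url)
        return response.text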

The principle is simple: every five minutes, poll the site to see whether anything new was published. Every torrent has a sequential id, and the latest seen id is kept in redis. Each round starts from the id after the stored one and probes the next 24 ids; any id that resolves to a real detail page is announced to the groups, and the stored id is advanced so the next round picks up where this one left off.
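
One thing to note: the start_num key has to exist in redis before the first run, which is what the commented-out set call in go() is for. A one-off seeding sketch, using the same aioredis 1.x calls as the plugin; the initial id 36583 comes from the commented-out line at the top of the script:

import asyncio
import aioredis

async def seed(initial_id):
    # store the newest torrent id you already know about; the scheduled
    # job will then start probing from initial_id + 1
    conn = await aioredis.create_connection('redis://127.0.0.1')
    await conn.execute('set', 'start_num', initial_id)
    conn.close()
    await conn.wait_closed()

asyncio.get_event_loop().run_until_complete(seed(36583))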

Screenshot of the result:


The other one, the air-quality scraper, works on a similar principle: poll on a timer, and use the record's timestamp instead of a torrent id to decide whether anything is new.

The air-quality scraper:

import requests
import time,json
import schedule
from pymysql import connect
import openpyxl
# database schema
# create table yingpu(
#     id int unsigned primary key auto_increment not null,
#     LST_AQI datetime,
#     coAQI smallint unsigned default 0,
#     coValue decimal(5,2) default 0.0,
#     no2AQI smallint unsigned default 0,
#     no2Value smallint unsigned default 0,
#     o3AQI smallint unsigned default 0,
#     o3Value smallint unsigned default 0,
#     o38AQI smallint unsigned default 0,
#     o38Value smallint unsigned default 0,
#     pm10AQI smallint unsigned default 0,
#     pm10Value smallint unsigned default 0,
#     pm10_24Value smallint unsigned default 0,
#     pm25AQI smallint unsigned default 0,
#     pm25Value smallint unsigned default 0,
#     pm25_24Value smallint unsigned default 0,
#     primaryPollutantAQI smallint unsigned default 0,
#     primaryPollutantGrade smallint unsigned default 0,
#     primaryPollutantQuality varchar(200) default '',
#     primaryPollutantType varchar(200) default '',
#     primaryPollutantValue smallint unsigned default 0,
#     so2AQI smallint unsigned default 0,
#     so2Value smallint unsigned default 0,
#     stationID varchar(200) default '',
#     stationName varchar(200) default ''
# );


# create log.xlsx in this script's directory before the first run

def get_result():
    url='http://demo.ssrvpn.store:8087/AQI/PatrolHandler.do'
    headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    param={'provider':'SEMCShare.ChildWeb','method':'RegionData'}
    data={'groupID':'215'}
    r = requests.post(url, params=param, headers=headers, data=data)
    # the endpoint returns a JSON list; index 2 is the station we want
    r_dict=json.loads(r.text)
    conn = connect(host='39.96.1.252', port=3306, database='aqi', user='root', password='137849', charset='utf8')
    cs = conn.cursor()
    # a record is new if its LST_AQI timestamp isn't in the table yet
    select_sql = 'select * from yingpu where LST_AQI = "%s";' % r_dict[2]['LST_AQI']
    print(select_sql)
    count = cs.execute(select_sql)
    if count == 0:
        print('update detected')
        r_dict = r_dict[2]
        info = (str(r_dict['LST_AQI']), int(r_dict['coAQI']), float(r_dict['coValue']),
                int(r_dict['no2AQI']), int(r_dict['no2Value']),
                int(r_dict['o3AQI']), int(r_dict['o3Value']),
                int(r_dict['o38AQI']), int(r_dict['o38Value']),
                int(r_dict['pm10AQI']), int(r_dict['pm10Value']), int(r_dict['pm10_24Value']),
                int(r_dict['pm25AQI']), int(r_dict['pm25Value']), int(r_dict['pm25_24Value']),
                int(r_dict['primaryPollutantAQI']), int(r_dict['primaryPollutantGrade']),
                str(r_dict['primaryPollutantQuality']), str(r_dict['primaryPollutantType']),
                int(r_dict['primaryPollutantValue']),
                int(r_dict['so2AQI']), int(r_dict['so2Value']),
                str(r_dict['stationID']), str(r_dict['stationName']))
        sql = 'insert into yingpu values(0,"%s",%d,%f,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,"%s","%s",%d,%d,%d,"%s","%s");' % info
        cs.execute(sql)
        # mirror the same row into the Excel log
        row = list(info)
        wb = openpyxl.load_workbook("log.xlsx")
        sheet = wb.active
        sheet.append(row)
        wb.save('log.xlsx')
    conn.commit()
    cs.close()
    conn.close()
    print('checking')
if __name__ == '__main__':
    # pull the page every 50 seconds
    schedule.every(50).seconds.do(get_result)
    while True:
        schedule.run_pending()
        time.sleep(1)  # sleep between checks so the loop doesn't spin at 100% CPU
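
The %-formatted SQL above works for this payload, but pymysql also supports parameterized queries (pass the values as the second argument to execute()), which avoids hand-quoting strings. A minimal sketch of the same duplicate check and insert in that style; insert_record is a hypothetical helper, and conn and the 24-value info tuple are as built in get_result():

def insert_record(conn, info):
    # info[0] is the LST_AQI timestamp; the cursor substitutes each %s
    # placeholder itself, so no manual quoting is needed
    cs = conn.cursor()
    count = cs.execute('select 1 from yingpu where LST_AQI = %s', (info[0],))
    if count == 0:
        placeholders = ','.join(['%s'] * len(info))  # 24 value slots
        cs.execute('insert into yingpu values (0,' + placeholders + ')', info)
    conn.commit()
    cs.close()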

If anything is unclear, or you need help, send me an email.

Tags: python