You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
auto/spider/news_get_apprcn.py

139 lines
5.3 KiB

# -*- coding: utf-8 -*-
'''
反斗限免
1, 获取反斗限免数据
2, 储存到mongodb
3, 发送到指定邮件
'''
import re
import time
from datetime import datetime
import httpx
import sys
import os
sys.path.append(os.path.join(os.path.abspath(__file__).split('auto')[0] + 'auto'))
from utils.utils_mongo_handle import MongoHandle
from utils.utils_logs_handle import LogsHandle
from utils.utils_send_email import SendEmail
from base.base_load_config import load_config
# Configuration mapping returned by the project's load_config() helper,
# read once at import time.
config_json = load_config()
# Value written into the "repush_times" field of every scraped record.
# NOTE(review): presumably the number of times a record may be re-pushed;
# confirm against the code that consumes this field.
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
class APPRCN(object):
    """Scraper for the apprcn free-app listing at free.apprcn.com.

    :meth:`main` drives the workflow:

    1. download the listing pages and extract the posts (:meth:`req`),
    2. insert posts whose title is not yet stored in MongoDB
       (:meth:`save_to_mongo`),
    3. e-mail the newly inserted posts when ``send_email_now`` is truthy
       (:meth:`send_to_email`).
    """

    # Region of a listing page that holds the posts (everything before the sidebar).
    _CONTENT_RE = re.compile(r'<div class="content">([\S\s]*?)<div class="sidebar">')
    # Per-post fields, matched against the whitespace-stripped content region.
    _NOTE_RE = re.compile(r'<p class="note">(.*?)</p>')
    _TITLE_RE = re.compile(r'title="(.*?)"')
    _TIME_RE = re.compile(r'<time>(.*?)</time>')
    _SOURCE_RE = re.compile(r'<a class="cat" href="(.*?)"')

    def __init__(self):
        """Set up logging, request settings and the MongoDB handle."""
        self.logs_handle = LogsHandle()
        self.now_day = time.strftime('%Y-%m-%d', time.localtime())
        # URL template for listing pages 2..N; page 1 is the site root.
        self.base_url = 'https://free.apprcn.com/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8'
        }
        db = 'NEWS'
        collection = 'apprcn_info'
        self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)
        # Records inserted during this run; these are what gets e-mailed.
        self.send_email_datas = []
        # E-mail switch: main() only calls send_to_email() when this is truthy.
        self.send_email_now = 0

    def main(self):
        """Run one full scrape: fetch, store and (optionally) e-mail."""
        self.logs_handle.logs_write('apprcn', '开始获取反斗限免数据', 'start', False)
        response_data = self.req()
        if response_data:
            self.save_to_mongo(response_data)
            if self.send_email_now:
                self.send_to_email()
            self.logs_handle.logs_write('apprcn', '反斗限免数据获取完成', 'done', False)
            print('done')
        else:
            self.logs_handle.logs_write('apprcn', '无法获取apprcn数据', 'error', False)

    @classmethod
    def _parse_page(cls, html):
        """Extract the posts contained in one listing page.

        Args:
            html: Text of a listing page.

        Returns:
            A list of ``(title, context, posted_date, source_url)`` tuples.
            The list is empty when the content region cannot be found.
            The four fields are matched independently and paired by position,
            so the result is truncated to the shortest list of matches.
        """
        regions = cls._CONTENT_RE.findall(html)
        content = ''
        if regions:
            # Strip BOTH tabs and newlines so the single-line patterns below
            # can match fields that span several lines in the markup.
            content = regions[0].replace('\t', '').replace('\n', '')
        contexts = cls._NOTE_RE.findall(content)
        titles = cls._TITLE_RE.findall(content)
        posted_dates = cls._TIME_RE.findall(content)
        source_urls = cls._SOURCE_RE.findall(content)
        return list(zip(titles, contexts, posted_dates, source_urls))

    def req(self):
        """Download listing pages 1-9 and build the records to store.

        A page that cannot be fetched (transport error or non-200 status) is
        logged and skipped, so one bad page does not discard the others.

        Returns:
            A non-empty list of record dicts, or ``None`` (after logging an
            error) when no record could be extracted from any page.
        """
        urls = ['https://free.apprcn.com/']
        urls.extend(self.base_url.format(page) for page in range(2, 10))

        response_data = []
        for url in urls:
            try:
                response = httpx.get(url=url, headers=self.headers)
            except httpx.HTTPError as err:
                self.logs_handle.logs_write('apprcn', '请求失败: %s' % err, 'error', False)
                continue
            if response.status_code != 200:
                self.logs_handle.logs_write('apprcn', '请求失败, 状态码: %s' % response.status_code, 'error', False)
                continue
            response.encoding = 'utf-8'

            for title, context, post_date, source_data in self._parse_page(response.text):
                response_data.append({
                    "title": title,
                    "context": context,
                    "source_url": source_data,
                    'link': '',
                    "article_type": '',
                    "article_source": '',
                    "img_url": '',
                    'keyword': '',
                    "posted_date": post_date,
                    "create_time": int(time.time()),
                    "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    "repush_times": DEFAULT_RE_PUSH_TIMES
                })

        if response_data:
            return response_data
        self.logs_handle.logs_write('apprcn', '获取数据失败', 'error', False)
        return None

    def save_to_mongo(self, data):
        """Insert every record whose title is not yet in the collection.

        Newly inserted records are also appended to ``self.send_email_datas``.

        Args:
            data: Iterable of record dicts as produced by :meth:`req`.

        Returns:
            ``0`` if a ``TypeError`` interrupted the run (remaining records
            are skipped), otherwise ``None``.
        """
        print('开始储存 反斗限免 数据')
        for data_to_insert in data:
            try:
                # The title is the de-duplication key.
                filter_criteria = {'title': data_to_insert.get('title', '')}
                count = self.mongo.collection.count_documents(filter_criteria)
                if count == 0:
                    # No document with this title yet: store it and queue it for e-mail.
                    self.mongo.collection.insert_one(data_to_insert)
                    self.send_email_datas.append(data_to_insert)
            except TypeError as te:
                print('\n%s' % te)
                self.logs_handle.logs_write('反斗限免', '写入数据库报错: %s' % te, 'error', False)
                return 0
        print('储存数据完成', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def send_to_email(self):
        """E-mail the records inserted during this run, if there are any."""
        if self.send_email_datas:
            text = ''.join(
                '标题: %s\n内容: %s\n时间: %s\n链接: %s\n\n' % (
                    data['title'], data['context'], data['posted_date'], data['source_url'])
                for data in self.send_email_datas
            )
            send_email = SendEmail(subject='反斗限免', title='反斗限免', text=text)
            send_email.send()
            self.logs_handle.logs_write('apprcn', '发送邮件完成', 'done', False)
        else:
            self.logs_handle.logs_write('apprcn', '没有新数据, 不发送邮件', 'done', False)
# Script entry point: run one scrape when this file is executed directly.
if __name__ == "__main__":
    scraper = APPRCN()
    scraper.main()