You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
auto/spider/spider_web3_coin_world.py

133 lines
5.0 KiB

# -*- coding: utf-8 -*-
'''
Coin World (528btc.com) spider: article section.

Module-level setup: imports, import-path adjustment and configuration
loading used by the BiShiJie scraper class defined below.
'''
import httpx
import os
import sys
from httpx import HTTPStatusError
import re
import time
from datetime import datetime
# Append "<everything before the first 'auto' in this file's absolute path>auto"
# to sys.path so the `utils.*` and `base.*` imports below can be resolved.
# This must run before those imports.
# NOTE(review): presumably 'auto' is the project root directory; the split
# takes the FIRST occurrence of 'auto' in the path -- confirm no parent
# directory name contains that substring.
sys.path.append(os.path.join(os.path.abspath(__file__).split('auto')[0] + 'auto'))
from utils.utils_mongo_handle import MongoHandle
from utils.utils_logs_handle import LogsHandle
from base.base_load_config import load_config
# Configuration is loaded once at import time.
config_json = load_config()
# Value written into the "repush_times" field of every scraped record.
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
class BiShiJie(object):
    """Scraper for the article list of www.528btc.com.

    Fetches list pages from the site's ``AjaxPageList`` endpoint, extracts
    one record per article card with regular expressions, and inserts
    records whose title is not yet present into the MongoDB collection
    ``NEWS`` / ``币世界-文章``.
    """

    # Patterns are compiled once at class-creation time instead of on every
    # loop iteration, and written as raw strings so ``\S`` / ``\s`` are not
    # treated as (invalid) string escapes.
    # One article card: everything between the opening content div and the
    # run of three closing divs that ends the card.
    _ITEM_RE = re.compile(
        r'<div class="slices_item_content">([\S\s]*?)</div>\n.*?</div>\n.*?</div>'
    )
    _TITLE_RE = re.compile(r'<div class="title overflow">(.*?)</div>')
    _CONTEXT_RE = re.compile(r'<div class="introduce overflow">(.*?)</div>')
    _HREF_RE = re.compile(r'<a target="_blank" href="(.*?)">')
    _TYPE_RE = re.compile(r'<span class="span">(.*?)</span>')
    _TIME_RE = re.compile(r'<span class="time">(.*?)</span>')

    def __init__(self):
        """Set up endpoint URLs, request headers, logging and the Mongo handle."""
        self.base_url = 'https://www.528btc.com'
        self.url = self.base_url + "/e/extend/api/v2/AjaxPageList/"
        # Records inserted during this run are collected here by
        # save_to_mongo().
        # NOTE(review): presumably consumed by an e-mail notifier elsewhere.
        self.send_email_datas = []
        self.send_email_now = 0
        self.logs_handle = LogsHandle()
        self.now_day = time.strftime('%Y-%m-%d', time.localtime())
        # NOTE(review): Accept-Encoding advertises br/zstd; httpx only
        # decodes those when the optional brotli/zstandard packages are
        # installed -- confirm for the deployment environment.
        self.headers = {
            "Accept": "text/html, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Origin": "https://www.528btc.com",
            "Referer": "https://www.528btc.com/kx/",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
            "X-Requested-With": "XMLHttpRequest",
        }
        db = 'NEWS'
        collection = '币世界-文章'
        self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*, or ''."""
        match = pattern.search(text)
        return match.group(1) if match else ''

    def _parse_articles(self, html):
        """Extract article records from one list-page HTML fragment.

        Args:
            html: Response body returned by the AjaxPageList endpoint.

        Returns:
            A list of dicts, one per matched article card, holding the
            content fields (``title`` ... ``posted_date``) in the key order
            used for storage. Fields that cannot be found are ''.
        """
        articles = []
        for card in self._ITEM_RE.findall(html):
            href = self._first_group(self._HREF_RE, card)
            article_type = self._first_group(self._TYPE_RE, card)
            # Site-relative hrefs are prefixed with the base URL; an href
            # that is already absolute is kept as-is rather than being
            # glued onto the base URL.
            if href.startswith(('http://', 'https://')):
                link = href
            else:
                link = self.base_url + href
            articles.append({
                "title": self._first_group(self._TITLE_RE, card),
                "context": self._first_group(self._CONTEXT_RE, card),
                "source_url": '',
                'link': link,
                "article_type": article_type,
                "article_source": '',
                "img_url": '',
                'keyword': article_type,
                "posted_date": self._first_group(self._TIME_RE, card),
            })
        return articles

    def req(self, max_pages=5):
        """Fetch list pages 1..max_pages and return all parsed records.

        Args:
            max_pages: Number of list pages to request (default 5).

        Returns:
            A list of record dicts ready for save_to_mongo(). A page that
            fails (HTTP error status or any other exception) is reported
            with print() and skipped; the remaining pages are still fetched.
        """
        all_data = []
        for page in range(1, max_pages + 1):
            form_data = {
                "pageIndex": f"{page}",
                "module": "newslist-v2",
                "classid": "114",
                "limitpage": "15"
            }
            try:
                response = httpx.post(self.url, headers=self.headers, data=form_data)
                # Raise HTTPStatusError on 4xx/5xx responses.
                response.raise_for_status()
                for article in self._parse_articles(response.text):
                    # Bookkeeping fields are appended after the content
                    # fields so the stored key order stays the same.
                    article["create_time"] = int(time.time())
                    article["create_datetime"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    article["repush_times"] = DEFAULT_RE_PUSH_TIMES
                    all_data.append(article)
            except HTTPStatusError as http_err:
                print(f"HTTP error occurred: {http_err}")
            except Exception as err:
                # Best-effort scraping: report and move on to the next page.
                print(f"An error occurred: {err}")
        return all_data

    def save_to_mongo(self, data):
        """Insert every record whose title is not yet in the collection.

        Newly inserted records are also appended to ``self.send_email_datas``.

        Args:
            data: Iterable of record dicts as produced by req().

        Returns:
            0 if a TypeError interrupts the run (the error is printed and
            logged, and the remaining records are not processed);
            otherwise None.
        """
        print('开始储存 币世界文章 数据')
        for data_to_insert in data:
            try:
                # Look for an existing document with the same title
                # (a missing title is treated as '').
                filter_criteria = {'title': data_to_insert.get('title', '')}
                count = self.mongo.collection.count_documents(filter_criteria)
                if count == 0:
                    # No match found: store the new document.
                    self.mongo.collection.insert_one(data_to_insert)
                    self.send_email_datas.append(data_to_insert)
            except TypeError as te:
                print('\n%s' % te)
                self.logs_handle.logs_write('币世界-文章', '写入数据库报错: %s' % te, 'error', False)
                return 0
        print('储存数据完成', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def main(self):
        """Run one scrape-and-store cycle; exit with status 0 if nothing was scraped."""
        all_data = self.req()
        if not all_data:
            print('数据为空')
            # sys.exit instead of the site-provided exit(), which is not
            # guaranteed to exist (e.g. under `python -S`).
            sys.exit(0)
        self.save_to_mongo(all_data)
# Script entry point: build the scraper and run one scrape-and-store cycle.
if __name__ == '__main__':
    spider = BiShiJie()
    spider.main()