You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
5.0 KiB
133 lines
5.0 KiB
# -*- coding: utf-8 -*-
|
|
'''Scraper for the article section of Bishijie (528btc.com).'''
|
|
import httpx
|
|
import os
|
|
import sys
|
|
from httpx import HTTPStatusError
|
|
import re
|
|
import time
|
|
from datetime import datetime
|
|
|
|
# Extend sys.path so the project-local packages imported below (utils.*, base.*)
# can be resolved: take this file's absolute path, keep the part before the first
# "auto" substring, append "auto" again and register that directory.
# NOTE(review): presumably "auto" is the project root directory name; split('auto')
# cuts at the FIRST occurrence, so a parent directory whose name contains "auto"
# would yield a different path - confirm against the deployment layout.
sys.path.append(os.path.join(os.path.abspath(__file__).split('auto')[0] + 'auto'))
|
|
|
|
from utils.utils_mongo_handle import MongoHandle
|
|
from utils.utils_logs_handle import LogsHandle
|
|
|
|
from base.base_load_config import load_config
|
|
|
|
# Load the project configuration once, when this module is imported.
config_json = load_config()
|
|
# Value written into the "repush_times" field of every scraped record.
# Looked up at import time, so a missing key fails immediately
# (assumes config_json behaves like a dict - TODO confirm).
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
|
|
|
|
|
|
class BiShiJie(object):
    """Scraper for the article list of https://www.528btc.com.

    List pages are requested from the site's AJAX endpoint, every list item
    is parsed with regular expressions, and items whose title is not yet
    present in the MongoDB collection held by ``self.mongo`` are inserted.
    """

    # The patterns are raw strings: in a normal string literal "\S" and "\s"
    # are invalid escape sequences.  They are compiled once here instead of
    # being looked up again for every item of every page.
    _ITEM_RE = re.compile(
        r'<div class="slices_item_content">([\S\s]*?)</div>\n.*?</div>\n.*?</div>'
    )
    _FIELD_RES = {
        'title': re.compile(r'<div class="title overflow">(.*?)</div>'),
        'context': re.compile(r'<div class="introduce overflow">(.*?)</div>'),
        'source_url': re.compile(r'<a target="_blank" href="(.*?)">'),
        'article_type': re.compile(r'<span class="span">(.*?)</span>'),
        'posted_date': re.compile(r'<span class="time">(.*?)</span>'),
    }

    def __init__(self):
        """Set up the endpoint URL, request headers, logger and Mongo handle."""
        self.base_url = 'https://www.528btc.com'
        self.url = self.base_url + "/e/extend/api/v2/AjaxPageList/"
        # Records inserted by save_to_mongo() are collected here.
        self.send_email_datas = []
        # send_email_now and now_day are not read anywhere in this class.
        self.send_email_now = 0
        self.logs_handle = LogsHandle()
        self.now_day = time.strftime('%Y-%m-%d', time.localtime())
        # NOTE(review): "br" and "zstd" are advertised below; httpx decodes
        # them only when the optional brotli / zstandard packages are
        # installed - confirm the runtime environment provides them.
        self.headers = {
            "Accept": "text/html, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Origin": "https://www.528btc.com",
            "Referer": "https://www.528btc.com/kx/",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
            "X-Requested-With": "XMLHttpRequest",
        }
        db = 'NEWS'
        collection = '币世界-文章'
        self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)

    @classmethod
    def _parse_items(cls, html):
        """Return the raw fields of every list item found in ``html``.

        Args:
            html: Response body of one list page.

        Returns:
            list[dict]: One dict per item with the keys ``title``,
            ``context``, ``source_url``, ``article_type`` and
            ``posted_date``.  A field that cannot be found is ``''``.
        """
        items = []
        for block in cls._ITEM_RE.findall(html):
            fields = {}
            for name, pattern in cls._FIELD_RES.items():
                # Only the first occurrence of each field is used.
                found = pattern.search(block)
                fields[name] = found.group(1) if found else ''
            items.append(fields)
        return items

    def _build_record(self, fields):
        """Turn the parsed ``fields`` of one item into the stored document."""
        return {
            "title": fields['title'],
            "context": fields['context'],
            "source_url": '',
            # The href taken from the page is appended to the site root.
            'link': self.base_url + fields['source_url'],
            "article_type": fields['article_type'],
            "article_source": '',
            "img_url": '',
            'keyword': fields['article_type'],
            "posted_date": fields['posted_date'],
            "create_time": int(time.time()),
            "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "repush_times": DEFAULT_RE_PUSH_TIMES,
        }

    def req(self, max_pages=5):
        """Fetch and parse the first ``max_pages`` pages of the article list.

        A page whose request or parsing fails is reported on stdout and
        skipped; the remaining pages are still processed.

        Args:
            max_pages: Number of list pages to request, starting at page 1.
                Defaults to 5, the value that was previously hard-coded.

        Returns:
            list[dict]: One record per parsed list item (possibly empty).
        """
        all_data = []
        for page in range(1, max_pages + 1):
            form_data = {
                "pageIndex": f"{page}",
                "module": "newslist-v2",
                "classid": "114",
                "limitpage": "15",
            }
            try:
                response = httpx.post(self.url, headers=self.headers, data=form_data)
                # Raise for 4xx / 5xx responses so they reach the handler below.
                response.raise_for_status()
                for fields in self._parse_items(response.text):
                    all_data.append(self._build_record(fields))
            except HTTPStatusError as http_err:
                print(f"HTTP error occurred: {http_err}")
            except Exception as err:
                print(f"An error occurred: {err}")
        return all_data

    def save_to_mongo(self, data):
        """Insert every record of ``data`` whose title is not stored yet.

        Each inserted record is also appended to ``self.send_email_datas``.

        Args:
            data: Iterable of record dicts as produced by :meth:`req`.

        Returns:
            ``0`` if a ``TypeError`` interrupted the run (the remaining
            records are then not processed), otherwise ``None``.
        """
        print('开始储存 币世界文章 数据')
        for data_to_insert in data:
            try:
                # Records are de-duplicated on their title; a record without
                # a "title" key is looked up with an empty title.
                filter_criteria = {'title': data_to_insert.get('title', '')}
                if self.mongo.collection.count_documents(filter_criteria) == 0:
                    # No matching document exists yet: insert the new one.
                    self.mongo.collection.insert_one(data_to_insert)
                    self.send_email_datas.append(data_to_insert)
            except TypeError as te:
                print('\n%s' % te)
                self.logs_handle.logs_write('币世界-文章', '写入数据库报错: %s' % te, 'error', False)
                return 0
        print('储存数据完成', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def main(self):
        """Run one scrape-and-store cycle; exit with status 0 if nothing was scraped."""
        all_data = self.req()

        if not all_data:
            print('数据为空')
            # sys.exit instead of the site-provided exit(), which is meant for
            # the interactive shell and is not guaranteed to be defined.
            sys.exit(0)

        self.save_to_mongo(all_data)
|
|
|
|
|
|
# Script entry point: build the scraper and run one scrape-and-store cycle.
if __name__ == '__main__':
    scraper = BiShiJie()
    scraper.main()
|
|
|