# -*- coding: utf-8 -*-
"""BiShiJie (528btc.com) article section scraper.

Requests the article list pages from the site's Ajax endpoint, extracts one
record per article block with regular expressions, and inserts records whose
title is not yet present into the configured Mongo collection.
"""
import os
import re
import sys
import time
from datetime import datetime

import httpx
from httpx import HTTPStatusError

# Put the project root (everything up to and including 'AutoInfo') on
# sys.path so that the project-local ``utils`` package can be imported.
sys.path.append(os.path.join(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo'))

# The wildcard import supplies LoadConfig, LogsHandle and MongoHandle. It may
# also rebind httpx / time / datetime; the explicit imports above only ensure
# those names exist even if utils.utils does not export them.
from utils.utils import *  # noqa: E402,F401,F403

config_json = LoadConfig().load_config()
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']

# NOTE(review): the patterns below are reproduced exactly as they appear in
# the reviewed source, where they look as if their HTML markup was stripped
# (one pattern is empty and two are a bare ``(.*?)``). As written they cannot
# extract meaningful fields - restore the original tag-anchored patterns from
# version control before relying on this scraper.
# Raw strings are used so that ``\S`` / ``\s`` are not invalid string escapes;
# ``\n`` inside a raw pattern still matches a newline, so matching is unchanged.
_ARTICLE_BLOCK_RE = re.compile(r'\n([\S\s]*?)\n\n.*?\n.*?')
_TITLE_RE = re.compile(r'\n(.*?)\n')
_CONTEXT_RE = re.compile(r'\n(.*?)\n')
_SOURCE_URL_RE = re.compile(r'')
_ARTICLE_TYPE_RE = re.compile(r'(.*?)')
_POSTED_DATE_RE = re.compile(r'(.*?)')


def _first(matches):
    """Return the first element of *matches*, or '' when it is empty."""
    return matches[0] if matches else ''


class BiShiJie(object):
    """Scraper for the 528btc.com article list."""

    def __init__(self):
        self.base_url = 'https://www.528btc.com'
        self.url = self.base_url + "/e/extend/api/v2/AjaxPageList/"
        # Records inserted during this run are collected here.
        self.send_email_datas = []
        self.send_email_now = 0
        self.logs_handle = LogsHandle()
        self.now_day = time.strftime('%Y-%m-%d', time.localtime())
        self.headers = {
            "Accept": "text/html, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Origin": "https://www.528btc.com",
            "Referer": "https://www.528btc.com/kx/",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
            "X-Requested-With": "XMLHttpRequest",
        }
        db = 'NEWS'
        collection = '币世界-文章'
        self.mongo = MongoHandle(db=db, collection=collection, del_db=False,
                                 del_collection=False, auto_remove=0)

    def req(self, max_pages=5):
        """Fetch the article list pages and return the parsed records.

        Args:
            max_pages: Number of list pages to request, starting at page 1.
                Defaults to 5, the value that was previously hard-coded.

        Returns:
            A list of dicts, one per article block found. A page that fails
            (HTTP error or any other exception) is reported on stdout and
            skipped; records from the other pages are still returned.
        """
        all_data = []
        for page in range(1, max_pages + 1):
            form_data = {
                "pageIndex": f"{page}",
                "module": "newslist-v2",
                "classid": "114",
                "limitpage": "15"
            }
            try:
                response = httpx.post(self.url, headers=self.headers, data=form_data)
                # Raise HTTPStatusError on 4xx/5xx responses.
                response.raise_for_status()
                html = response.text

                for div in _ARTICLE_BLOCK_RE.findall(html):
                    title = _first(_TITLE_RE.findall(div))
                    context = _first(_CONTEXT_RE.findall(div))
                    source_url = _first(_SOURCE_URL_RE.findall(div))
                    article_type = _first(_ARTICLE_TYPE_RE.findall(div))
                    posted_date = _first(_POSTED_DATE_RE.findall(div))

                    all_data.append({
                        "title": title,
                        "context": context,
                        # "source_url" is stored empty; the scraped relative
                        # URL is only used to build the absolute "link".
                        "source_url": '',
                        'link': self.base_url + source_url,
                        "article_type": article_type,
                        "article_source": '',
                        "img_url": '',
                        'keyword': article_type,
                        "posted_date": posted_date,
                        "create_time": int(time.time()),
                        "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        "repush_times": DEFAULT_RE_PUSH_TIMES
                    })
            except HTTPStatusError as http_err:
                print(f"HTTP error occurred: {http_err}")
            except Exception as err:
                # Best-effort per page: report the problem and move on.
                print(f"An error occurred: {err}")
        return all_data

    def save_to_mongo(self, data):
        """Insert every record whose title is not already in the collection.

        Newly inserted records are also appended to ``self.send_email_datas``.

        Args:
            data: Iterable of record dicts as produced by ``req``.

        Returns:
            0 if a TypeError interrupts the run (the remaining records are
            not processed), otherwise None.
        """
        print('开始储存 币世界文章 数据')
        for data_to_insert in data:
            try:
                # Titles are used as the uniqueness key.
                filter_criteria = {'title': data_to_insert.get('title', '')}
                count = self.mongo.collection.count_documents(filter_criteria)
                if count == 0:
                    # No matching document yet: insert it and remember it.
                    self.mongo.collection.insert_one(data_to_insert)
                    self.send_email_datas.append(data_to_insert)
            except TypeError as te:
                print('\n%s' % te)
                self.logs_handle.logs_write('币世界-文章', '写入数据库报错: %s' % te, 'error', False)
                return 0
        print('储存数据完成', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def main(self):
        """Scrape the article list and store it; exit with status 0 if empty."""
        all_data = self.req()

        if not all_data:
            print('数据为空')
            # sys.exit instead of the interactive ``exit`` helper, which is
            # injected by ``site`` and is not guaranteed to exist.
            sys.exit(0)

        self.save_to_mongo(all_data)


if __name__ == '__main__':
    BiShiJie().main()