import time
import re
import os
import sqlite3

import httpx
from playwright.sync_api import sync_playwright

current_dir_path = os.path.dirname(os.path.abspath(__file__))
comico_key = 'OMzNzNS'
base_url = 'https://www.dumanwu.com'
target_url = base_url + '/' + comico_key
download_folder = os.path.join(current_dir_path, 'downloads')
if not os.path.exists(download_folder):
    os.mkdir(download_folder)


def write_db(title, db_path, chapter_folder_name, chapter_url):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # The comic title is used as the table name, so quote it to tolerate
    # spaces and punctuation.
    cursor.execute(
        f'CREATE TABLE IF NOT EXISTS "{title}" ('
        'id INTEGER PRIMARY KEY AUTOINCREMENT, '
        'chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
    )
    conn.commit()
    # Check whether this chapter_name is already recorded
    cursor.execute(
        f'SELECT EXISTS(SELECT 1 FROM "{title}" WHERE chapter_name = ?)',
        (chapter_folder_name,))
    exists = cursor.fetchone()[0]
    if not exists:
        # Insert a new record only if it is not there yet
        cursor.execute(
            f'INSERT INTO "{title}" (chapter_name, url) VALUES (?, ?)',
            (chapter_folder_name, chapter_url))
        conn.commit()
    cursor.close()
    conn.close()


def load_db(title, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # Return only chapters that have not been downloaded yet (state = 0)
    cursor.execute(f'SELECT * FROM "{title}" WHERE state = 0 ORDER BY id ASC')
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def fetch_page_title(target_url):
    with httpx.Client(verify=False) as client:  # skip certificate verification
        response = client.get(target_url)
        if response.status_code != 200:
            print(f'Error: {response.status_code}')
            exit(0)
        # The original site-specific pattern was lost in extraction; this assumes
        # the comic title sits in an <h1> tag and may need adjusting.
        title = re.findall(r'<h1[^>]*>(.*?)</h1>', response.text)
        if title:
            return title[0]
        else:
            print("Title not found")
            exit(0)


def fetch_chapter_data():
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=True,
            args=['--ignore-certificate-errors']
        )
        page = browser.new_page()
        page.goto(target_url)
        time.sleep(1)
        # "Load all chapters" button; retry the click a few times in case the
        # element is not attached yet.
        button_selector = 'body > div > div > div.forminfo > div.chapterList > div.chapterlistload > div > button'
        for i in range(3):
            try:
                page.click(button_selector)
                break
            except Exception:
                pass
            page.wait_for_timeout(1000)
        source = page.content()
        # The original chapter-list patterns were lost in extraction; the ones
        # below assume a <ul> of <li><a> entries and are best-effort guesses.
        ul_list = re.findall(r'<ul[^>]*>(.*?)</ul>', source, re.DOTALL)
        if len(ul_list) > 0:
            ul_list = ul_list[0]
        else:
            return False
        chapter_url_list = re.findall(r'<a href="(.*?)"', ul_list)
        chapter_name_list = re.findall(r'<li[^>]*>\s*<a[^>]*>(.*?)</a>', ul_list)
        # The site lists chapters newest first; reverse into reading order
        chapter_url_list = chapter_url_list[::-1]
        chapter_name_list = chapter_name_list[::-1]
        result = {}
        chapter_count = 1
        for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
            chapter_count_str = str(chapter_count).zfill(4)
            chapter_url = base_url + chapter_url
            result[chapter_count_str] = (chapter_name, chapter_url)
            chapter_count += 1
        browser.close()
        return result


def fetch_images(data, chapter_folder_name):
    data_id = data[0]
    chapter_url = data[2]
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=False,
            args=['--ignore-certificate-errors']
        )
        page = browser.new_page()
        page.goto(chapter_url)
        time.sleep(1)
        html_content = page.content()  # full page HTML after rendering
        # The original image-container pattern was lost in extraction; this
        # assumes the chapter images sit inside a single <div> block.
        img_list = re.findall(r'<div[^>]*>([\S\s]*?)</div>', html_content)
        img_list = img_list[0]
        urls = re.findall('