commit efcc609cc5a93dce8a6f43c18355e45b29ce485b
Author: jack
Date:   Wed Nov 12 16:43:01 2025 +0800

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..46f0206
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+.DS_Store
+__pycache__/
+*.pyc
+.idea
+
+*/downloads/*
\ No newline at end of file
diff --git a/dumanwu/main.py b/dumanwu/main.py
new file mode 100644
index 0000000..67aab4a
--- /dev/null
+++ b/dumanwu/main.py
@@ -0,0 +1,168 @@
+import time
+import re
+import os
+import sqlite3
+import httpx
+from playwright.sync_api import sync_playwright
+
+current_dir_path = os.path.dirname(os.path.abspath(__file__))
+
+comico_key = 'OMzNzNS'
+base_url = 'https://www.dumanwu.com'
+target_url = base_url + '/' + comico_key
+
+download_folder = os.path.join(current_dir_path, 'downloads')
+if not os.path.exists(download_folder):
+    os.mkdir(download_folder)
+
+
+def write_db(title, db_path, chapter_folder_name, chapter_url):
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute(
+        f'CREATE TABLE IF NOT EXISTS {title} (id INTEGER PRIMARY KEY AUTOINCREMENT, chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
+    )
+    conn.commit()
+
+    # Check whether this chapter_name already exists
+    cursor.execute(
+        f'SELECT EXISTS(SELECT 1 FROM {title} WHERE chapter_name = ?)', (chapter_folder_name,))
+    exists = cursor.fetchone()[0]
+
+    if not exists:
+        # Insert a new record only if it is not there yet
+        cursor.execute(f'INSERT INTO {title} (chapter_name, url) VALUES (?, ?)', (chapter_folder_name, chapter_url))
+        conn.commit()
+
+    cursor.close()
+    conn.close()
+
+
+def load_db(title, db_path):
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute(f'SELECT * FROM {title} WHERE state = 0 ORDER BY id ASC')
+    rows = cursor.fetchall()
+    cursor.close()
+    conn.close()
+    return rows
+
+
+def fetch_page_title(target_url):
+    with httpx.Client(verify=False) as client:  # skip TLS certificate verification
+        response = client.get(target_url)
+        if response.status_code != 200:
+            print(f'Error: {response.status_code}')
+            exit(0)
+        title = re.findall(r'<h1.*?>(.*?)</h1>', response.text)
+        if title:
+            return title[0]
+        else:
+            print("Title not found")
+            exit(0)
+
+
+def fetch_chapter_data():
+    with sync_playwright() as playwright:
+        browser = playwright.chromium.launch(
+            headless=True,
+            args=['--ignore-certificate-errors']
+        )
+        page = browser.new_page()
+        page.goto(target_url)
+
+        time.sleep(1)
+
+        button_selector = 'body > div > div > div.forminfo > div.chapterList > div.chapterlistload > div > button'
+        for i in range(3):
+            try:
+                page.click(button_selector)
+                break
+            except Exception as e:
+                pass
+
+        page.wait_for_timeout(1000)
+
+        source = page.content()
+
+        ul_list = re.findall('<ul.*?>(.*?)</ul>', source, re.DOTALL)
+        if len(ul_list) > 0:
+            ul_list = ul_list[0]
+        else:
+            return False
+
+        chapter_url_list = re.findall('<a href="(.*?)"', ul_list)
+        chapter_name_list = re.findall('<li.*?>.*?<a.*?>(.*?)</a>', ul_list)
+
+        chapter_url_list = chapter_url_list[::-1]
+        chapter_name_list = chapter_name_list[::-1]
+
+        result = {}
+
+        chapter_count = 1
+        for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
+            chapter_count_str = str(chapter_count).zfill(4)
+            chapter_url = base_url + chapter_url
+            result[chapter_count_str] = (chapter_name, chapter_url)
+            chapter_count += 1
+
+        browser.close()
+
+        return result
+
+
+def fetch_images(data, chapter_folder_name):
+    data_id = data[0]
+    chapter_url = data[2]
+    with sync_playwright() as playwright:
+        browser = playwright.chromium.launch(
+            headless=False,
+            args=['--ignore-certificate-errors']
+        )
+        page = browser.new_page()
+        page.goto(chapter_url)
+
+        time.sleep(1)
+
+        html_content = page.content()  # HTML of the fully rendered page
+        img_list = re.findall('', html_content)
+        img_list = img_list[0]
+        urls = re.findall('
+
+        parent_element = soup.select_one('body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
+        if not parent_element:
+            raise Exception(f"{chapter_name} image container not found")
+
+        # Collect all image elements
+        img_elements = parent_element.select('img')
+        total_images = len(img_elements)
+        print(f'{chapter_name}: {total_images} images in total')
+
+        # Record every image URL
+        for img in img_elements:
+            img_url = img.get('src')
+            if img_url:
+                img_links.append(img_url)
+    except Exception as e:
+        print(f"Error while collecting images: {e}")
+        raise  # re-raise so the retry logic is triggered
+    return img_links
+
+
+def save_urls(folder_path, img_links):
+    # Path of the file that stores the collected image links
+    save_path = os.path.join(folder_path, 'img_links.txt')
+
+    # Write the image links to the file
+    with open(save_path, 'w', encoding='utf-8') as file:
+        for link in img_links:
+            file.write(link + '\n')
+
+    print(f"Image links saved to: {save_path}")
+
+
+def new_folder(page_title):
+    # Directory that contains this script
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    download_dir = os.path.join(script_dir, 'downloads')
+    if not os.path.exists(download_dir):
+        os.makedirs(download_dir)
+
+    if page_title:
+        # Build the target folder path
+        folder_path = os.path.join(download_dir, page_title)
+
+        # Create the folder if it does not exist yet
+        if not os.path.exists(folder_path):
+            os.makedirs(folder_path)
+
+        return folder_path
+
+
+def get_chapter_data(client, target_url):
+    result = {}
+    page_title = ''
+
+    try:
+        response = client.get(target_url)
+        if response.status_code != 200:
+            raise Exception(f"Cannot reach {target_url}, status code: {response.status_code}")
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # All elements under the given selector
+        elements = soup.select(
+            'body > div.container > div:nth-child(3) > div:nth-child(2) a')
+
+        # Extract the URL and text of every element
+        for element in elements:
+            url = element.get('href')
+            text = element.get_text()
+            result[text] = base_url + url
+    except Exception as e:
+        print(f"Error while fetching chapter data: {e}")
+        raise  # re-raise so the retry logic is triggered
+
+    return result
+
+
+def main():
+    proxy_url = 'http://127.0.0.1:7890'
+    base_url = 'https://jcomic.net'
+    herf_url = '/eps/'
+    # Custom request headers
+    custom_headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh-CN,zh;q=0.9",
+        "cache-control": "max-age=0",
+        "cookie": "_gid=GA1.2.724162267.1736817775; _ga_QL6YSDRWEV=GS1.1.1736837388.3.0.1736837388.0.0.0; _ga=GA1.2.1324234734.1736817774; _gat=1",
+        "priority": "u=0, i",
+        "sec-ch-ua": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": '"macOS"',
+        "sec-fetch-dest": "document",
+        "sec-fetch-mode": "navigate",
+        "sec-fetch-site": "same-origin",
+        "sec-fetch-user": "?1",
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+    }
+
+    for comico_url in comico_urls:
+        target_url = base_url + herf_url + comico_url
+        print(target_url)
+        # Maximum number of retries
+        max_retries = 999
+        retry_count = 0
+
+        while retry_count < max_retries:
+            try:
+                # Create an httpx.Client with the custom headers
+                with httpx.Client(proxies=proxy_url if use_proxy else None, headers=custom_headers) as client:
+                    # 1. Fetch the chapter data of the page
+                    chapter_data = get_chapter_data(client, target_url)
+                    print(chapter_data)
+
+                    # 2. Create a folder under the current directory to hold the images; its name is the title
+                    folder_path = new_folder(comico_url)
+
+                    # 3. Walk the chapter data and collect the img links
+                    img_links = get_imgs(client, folder_path, chapter_data)
+                    print(img_links)
+
+                    # 4. Save the URLs into the newly created folder
+                    save_urls(folder_path, img_links)
+
+                    # 5. Walk img_links and save each image into folder_path,
+                    #    using file names like 0001.png
+                    save_img(client, folder_path, img_links)
+
+                    # Break out of the loop once everything succeeded
+                    print('done!')
+                    break
+
+            except Exception as e:
+                retry_count += 1
+                print(f"Error: {e}, starting retry #{retry_count}...")
+                if retry_count >= max_retries:
+                    print("Maximum number of retries reached, stopping.")
+                    break
+
+                # Fixed delay of 30 seconds before the next attempt
+                delay = 30
+                print(f"Waiting {delay} seconds before retrying...")
+                time.sleep(delay)
+
+
+if __name__ == '__main__':
+    main()
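Note: the save_img helper invoked in step 5 of main() above is not visible in this excerpt of the commit. A minimal sketch of what such a step could look like, assuming the same httpx client and the zero-padded .png names mentioned in the comment; only the name and signature come from the call site, the body is an assumption:

    import os

    def save_img(client, folder_path, img_links):
        # Hypothetical sketch; the committed implementation is not shown in this diff.
        # Download every collected URL and store it as 0001.png, 0002.png, ...
        for index, link in enumerate(img_links, start=1):
            response = client.get(link)
            response.raise_for_status()
            img_path = os.path.join(folder_path, f'{str(index).zfill(4)}.png')
            with open(img_path, 'wb') as f:
                f.write(response.content)
            print(f'saved {img_path}')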
diff --git a/zcymh/zcymh.py b/zcymh/zcymh.py
new file mode 100644
index 0000000..2feed18
--- /dev/null
+++ b/zcymh/zcymh.py
@@ -0,0 +1,261 @@
+# -*- coding: utf-8 -*-
+
+import platform
+import time
+import random
+from datetime import datetime
+import re
+import os
+from pymongo import MongoClient
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+import httpx
+
+
+def browser_opt():
+    # Configure the browser before opening it
+    os_name = platform.system()
+    chrome_options = Options()
+    chrome_options.add_argument('--no-sandbox')
+    chrome_options.add_argument('--disable-setuid-sandbox')
+    chrome_options.add_argument('--disable-gpu')
+    chrome_options.add_argument('--headless')  # run in headless mode
+    # chrome_options.add_argument('--incognito')  # incognito (private) mode
+    # chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # point to a specific browser binary
+
+    if os_name == 'Linux':
+        chrome_options.binary_location = '/usr/bin/chromium'  # on Linux the chromium binary must be set explicitly
+    else:
+        pass  # no extra path needed on other systems
+
+    browser = webdriver.Chrome(options=chrome_options)
+
+    return browser
+
+
+def browser_open(browser, url):
+    # Open the given URL
+    browser.get(url)
+    time.sleep(random.uniform(1, 2))
+    return browser
+
+
+def browser_get_page_source(browser):
+    # Return the source of the current page
+    return browser.page_source
+
+
+def browser_find_by_selector(browser, selector):
+    # Look up an element by CSS selector
+    try:
+        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
+        element = browser.find_element(By.CSS_SELECTOR, selector)
+        if not element:
+            return None
+        return element.text
+    except Exception as e:
+        print(e)
+        return None
+
+
+def browser_screenshot(browser):
+    # Title of the current page
+    title = browser.title
+    # Current timestamp
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # Build the file name
+    filename = f"{title.replace(' ', '')}_{timestamp}.png"
+    # Save the screenshot
+    browser.save_screenshot(filename)
+    print(f"Screenshot saved: {filename}")
+
+
+def browser_close(browser):
+    browser.close()
+
+
+def sanitize_filename(string):
+    # Strip characters that Windows does not allow in file names
+    allowed_chars = re.compile(r'[<>:"/\\|?*]')
+    sanitized_filename = allowed_chars.sub('', string)
+
+    # Replace spaces with underscores
+    sanitized_filename = sanitized_filename.replace(' ', '_')
+
+    # Make sure the name does not start with a dot
+    if sanitized_filename.startswith('.'):
+        sanitized_filename = '_' + sanitized_filename[1:]
+
+    # Make sure the name does not contain two consecutive dots
+    sanitized_filename = sanitized_filename.replace('..', '.')
+
+    # Make sure the name is not an empty string
+    if not sanitized_filename:
+        sanitized_filename = 'noname' + '_' + str(int(time.time()))
+
+    return sanitized_filename
+
+
+def task1():
+    browser = browser_opt()
+    print('Opening browser')
+    browser = browser_open(browser, url)
+    print(f'Heading to url: {url}')
+
+    page_source = browser_get_page_source(browser)
+
+    # Get the comic name; it becomes the folder name
+    book_name = re.findall('meta property="og:novel:book_name" content="(.*?)"', page_source)
+    if book_name:
+        book_name = book_name[0]
+
+        book_name = sanitize_filename(book_name)
+    else:
+        print("Failed to get the comic name")
+        exit(0)
+
+    # Collect the URL of every set
+    all_set = []
+
+    host = 'https://zcymh.com'
+
+    start_tag = ''
+    end_tag = ''
+    start_index = page_source.find(start_tag)
+    end_index = page_source.find(end_tag, start_index)
+    if start_index != -1 and end_index != -1:
+        target_element = page_source[start_index + len(start_tag):end_index]
+        pattern = r''
+        matches = re.findall(pattern, target_element)
+        set_num = 1
+        for match in matches:
+            title = sanitize_filename(match[0])
+            set_url = host + match[1]
+            # Ordered by viewing order: index, set name, set url
+            all_set.append([str(set_num).zfill(4), title, set_url])
+            set_num += 1
+
+    # Loop over every set URL, grab every image of the set, and store them in one big list
+    all_data_list = []
+    for set_data in all_set:
+        browser = browser_open(browser, set_data[2])
+
+        page_source = browser_get_page_source(browser)
+        page_list = re.findall('
+
+    title = re.findall('(.*?)', text)
+    title = title[0] if title else comico_id
+    print(title)
+    # First write the current target name to a txt file; when reading the database later,
+    # this name is used as the table name to read the data back out
+    with open(txt_path, 'w', encoding='utf-8') as f:
+        print('Writing current target name')
+        f.write(title)
+
+    chapters = re.findall(r'<li><a href="(.*?)">(.*?)</a>', text)
+    for chapter in chapters:
+        chapters_data[chapter[1]] = base_url + chapter[0]
+
+    # Create the sqlite database and store the data
+    create_db(title)
+
+    for chapter_name, url in chapters_data.items():
+        write_to_db(title, chapter_name, url)
+    print('Data OK')
+
+
+async def get_chapter_list():
+    await async_get_chapter_list()
+
+
+def load_db(title):
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute(f'SELECT * FROM {title} WHERE state = 0 ORDER BY id ASC')
+    rows = cursor.fetchall()
+    cursor.close()
+    conn.close()
+    return rows
+
+
+def change_db_data_state(data_id, t_name):
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    table_name = t_name
+    id_column = 'id'
+    id_value = data_id
+    bool_column = 'state'
+    sql = f'UPDATE {table_name} SET {bool_column} = 1 WHERE {id_column} = ?'
+    cursor.execute(sql, (id_value,))
+    conn.commit()
+    cursor.close()
+    conn.close()
+
+
+def scroll_to_percentage(page):
+    # Scroll the browser page step by step
+    percentage_list = [i for i in range(5, 101, scroll_speed)]
+    for percentage in percentage_list:
+        # Compute the height that corresponds to this percentage of the page
+        height = page.evaluate("() => document.body.scrollHeight")
+        scroll_position = height * (percentage / 100)
+        # Jump to that position
+        page.evaluate(f"window.scrollTo(0, {scroll_position})")
+        time.sleep(0.5)
+
+
+def request_chapter_data(title, data_id, chapter_name, chapter_url):
+    chapter_folder = os.path.join(current_dir_path, 'downloads', title, chapter_name)
+    with sync_playwright() as playwright:
+        try:
+            browser = playwright.chromium.launch(headless=True)
+            page = browser.new_page()
+            page.goto(chapter_url)
+            page.wait_for_load_state('networkidle')
+        except Exception as e:
+            print(e)
+            return False
+
+        # Scroll through the page to trigger the lazy-loaded images
+        print('Starting to scroll the page')
+        scroll_to_percentage(page)
+        page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
+        scroll_to_percentage(page)
+        print('Scrolling finished')
+        time.sleep(2)
+
+        # Check whether every image has finished loading; if not, leave the function
+        # and reopen the browser on the next attempt
+        html_content = page.content()
+        check_list = re.findall('img class="lazy-read" src="(.*?)"', html_content)
+        for l in check_list:
+            if 'lazy-read.gif' in l:
+                return False
+
+        # Create the chapter folder
+        if not os.path.exists(chapter_folder):
+            os.makedirs(chapter_folder)
+
+        # Number of matching image elements
+        total_images = page.locator('.lazy-read').count()
+
+        for page_num in range(1, total_images + 1):
+            img_locator = f'body > div.chpater-images > img:nth-child({page_num})'
+            img_path = os.path.join(chapter_folder, f'{str(page_num).zfill(3)}.png')
+            page.locator(img_locator).screenshot(path=img_path)
+            print(f'Downloaded {img_path}')
+
+        # After the whole chapter is downloaded, flip its state field to True
+        print(f'{chapter_name} download finished\n\n')
+        change_db_data_state(data_id, title)
+
+        browser.close()
+        return True
+
+
+def main():
+    asyncio.run(get_chapter_list())
+
+    # Start crawling the chapters
+    # First read the name of the current target
+    title = ''
+    with open(txt_path, 'r', encoding='utf-8') as f:
+        title = f.read()
+
+    folder_name = os.path.join(download_folder, title)
+    if not os.path.exists(folder_name):
+        os.mkdir(folder_name)
+
+    for retry in range(999):
+        load_data = load_db(title)
+
+        if not load_data:
+            print('The database has no data or all done!')
+            exit(0)
+
+        for data in load_data:
+            ok = True
+            data_id = data[0]
+            chapter_name = data[1]
+            chapter_url = data[2]
+            print(f'About to fetch images: {title} {chapter_name}')
+            ok = request_chapter_data(title, data_id, chapter_name, chapter_url)
+            if not ok:
+                print(f'Image loading failed: {title} {chapter_name}, retrying\n\n')
+                time.sleep(5)
+                break


if __name__ == "__main__":
    main()
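Note: the last file references several module-level names (current_dir_path, download_folder, txt_path, db_path, comico_id, scroll_speed) whose definitions fall outside the excerpt above. A minimal sketch of the kind of setup block they imply, modeled on the conventions of dumanwu/main.py; every concrete value here is an assumption, not the committed code:

    import os

    current_dir_path = os.path.dirname(os.path.abspath(__file__))
    download_folder = os.path.join(current_dir_path, 'downloads')
    os.makedirs(download_folder, exist_ok=True)

    comico_id = 'XXXXXX'                 # hypothetical comic id; the real value is not shown above
    base_url = 'https://www.example.com' # hypothetical site root; the real value is not shown above
    txt_path = os.path.join(current_dir_path, 'current_target.txt')  # assumed handoff file read back in main()
    db_path = os.path.join(current_dir_path, 'chapters.db')          # assumed sqlite file used by load_db()
    scroll_speed = 5                     # assumed step (in percent) used by scroll_to_percentage()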