first commit

main
jack 4 weeks ago
commit efcc609cc5
6 files changed, 889 insertions(+)
  .gitignore               +6
  dumanwu/main.py          +168
  jcomic/main.py           +205
  zcymh/zcymh.py           +261
  zhuimh/merge_images.py   +45
  zhuimh/zhuimh.py         +204

.gitignore

@@ -0,0 +1,6 @@
.DS_Store
__pycache__/
*.pyc
.idea
*/downloads/*

dumanwu/main.py
@@ -0,0 +1,168 @@
import time
import re
import os
import sqlite3

import httpx
from playwright.sync_api import sync_playwright

current_dir_path = os.path.dirname(os.path.abspath(__file__))
comico_key = 'OMzNzNS'
base_url = 'https://www.dumanwu.com'
target_url = base_url + '/' + comico_key
download_folder = os.path.join(current_dir_path, 'downloads')
if not os.path.exists(download_folder):
    os.mkdir(download_folder)


def write_db(title, db_path, chapter_folder_name, chapter_url):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        f'CREATE TABLE IF NOT EXISTS {title} (id INTEGER PRIMARY KEY AUTOINCREMENT, chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
    )
    conn.commit()
    # Check whether this chapter_name already exists
    cursor.execute(
        f'SELECT EXISTS(SELECT 1 FROM {title} WHERE chapter_name = ?)', (chapter_folder_name,))
    exists = cursor.fetchone()[0]
    if not exists:
        # Insert a new record only if it does not exist yet
        cursor.execute(f'INSERT INTO {title} (chapter_name, url) VALUES (?, ?)', (chapter_folder_name, chapter_url))
        conn.commit()
    cursor.close()
    conn.close()


def load_db(title, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(f'SELECT * FROM {title} WHERE state = 0 ORDER BY id ASC')
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def fetch_page_title(target_url):
    with httpx.Client(verify=False) as client:  # skip certificate verification
        response = client.get(target_url)
        if response.status_code != 200:
            print(f'Error: {response.status_code}')
            exit(0)
        title = re.findall(r'<p></p><h1 class="name_mh">(.*?)</h1><p></p>', response.text)
        if title:
            return title[0]
        else:
            print("Title not found")
            exit(0)


def fetch_chapter_data():
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=True,
            args=['--ignore-certificate-errors']
        )
        page = browser.new_page()
        page.goto(target_url)
        time.sleep(1)
        button_selector = 'body > div > div > div.forminfo > div.chapterList > div.chapterlistload > div > button'
        for i in range(3):
            try:
                page.click(button_selector)
                break
            except Exception:
                pass
            page.wait_for_timeout(1000)
        source = page.content()
        ul_list = re.findall('<ul>(.*?)</ul>', source, re.DOTALL)
        if len(ul_list) > 0:
            ul_list = ul_list[0]
        else:
            return False
        chapter_url_list = re.findall('<a href="(.*?)">', ul_list)
        chapter_name_list = re.findall('<li>(.*?)</li>', ul_list)
        chapter_url_list = chapter_url_list[::-1]
        chapter_name_list = chapter_name_list[::-1]
        result = {}
        chapter_count = 1
        for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
            chapter_count_str = str(chapter_count).zfill(4)
            chapter_url = base_url + chapter_url
            result[chapter_count_str] = (chapter_name, chapter_url)
            chapter_count += 1
        browser.close()
        return result


def fetch_images(data, chapter_folder_name):
    data_id = data[0]
    chapter_url = data[2]
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=False,
            args=['--ignore-certificate-errors']
        )
        page = browser.new_page()
        page.goto(chapter_url)
        time.sleep(1)
        html_content = page.content()  # full HTML of the rendered page
        img_list = re.findall(r'<div class="main_img"><div class="chapter-img-box">([\S\s]*?)</a></div>', html_content)
        img_list = img_list[0]
        # Non-capturing group so findall returns plain URL strings, not tuples
        urls = re.findall(r'<img (?:src="|data-src=")(.*?)"', img_list)
        for url in urls:
            page.goto(url)
        browser.close()


def main():
    print(target_url)
    # ------------------------------ step1 ------------------------------
    title = fetch_page_title(target_url)
    comico_folder = os.path.join(download_folder, title)
    if not os.path.exists(comico_folder):
        os.mkdir(comico_folder)
    # Create the chapter db and store the chapter data in it
    db_path = os.path.join(comico_folder, 'comico.db')
    # Fetch each chapter's title and url
    chapter_data = fetch_chapter_data()
    for k, v in chapter_data.items():
        chapter_url = v[1]
        write_db(title, db_path, k + '_' + v[0], chapter_url)
    # ------------------------------ step2 ------------------------------
    all_data = load_db(title, db_path)
    for data in all_data:
        chapter_folder_name = os.path.join(comico_folder, data[1])
        if not os.path.exists(chapter_folder_name):
            os.mkdir(chapter_folder_name)
        fetch_images(data, chapter_folder_name)
        time.sleep(999)


if __name__ == '__main__':
    main()

jcomic/main.py
@@ -0,0 +1,205 @@
import os
import time
import random

import httpx
from bs4 import BeautifulSoup

comico_urls = [
    '[PIXIV] LotteryFate (18900473)(AI)',
]
# Whether to route requests through the proxy
use_proxy = 1
# Site root; module-level so get_chapter_data() can reference it
base_url = 'https://jcomic.net'


def save_img(client, folder_path, img_links):
    for index, img_url in enumerate(img_links, start=1):
        try:
            # Build the file name, e.g. 0001.png, 0002.png
            file_name = f"{str(index).zfill(4)}.png"
            file_path = os.path.join(folder_path, file_name)
            # Skip files that already exist
            if os.path.exists(file_path):
                print(f"File already exists, skipping download: {file_path}")
                continue
            # Request the image content
            response = client.get(img_url)
            if response.status_code != 200:
                raise Exception(
                    f"Failed to download image {img_url}, status code: {response.status_code}")
            # Save the image locally
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Image saved: {file_path}")
        except Exception as e:
            raise Exception(f"Error while downloading image {img_url}: {e}")
        # random_sleep = random.uniform(2, 3)
        # print(f"Sleeping {random_sleep} seconds")
        # time.sleep(random_sleep)


def get_imgs(client, folder_path, chapter_data):
    img_links = []
    for chapter_name, url in chapter_data.items():
        try:
            # Request the chapter page
            response = client.get(url)
            if response.status_code != 200:
                raise Exception(f"Failed to access {url}, status code: {response.status_code}")
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            # Select the parent element that holds the images
            parent_element = soup.select_one(
                'body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
            if not parent_element:
                raise Exception(f"{chapter_name}: image container not found")
            # Collect all image elements
            img_elements = parent_element.select('img')
            total_images = len(img_elements)
            print(f'{chapter_name}: {total_images} images')
            # Record each image URL
            for img in img_elements:
                img_url = img.get('src')
                if img_url:
                    img_links.append(img_url)
        except Exception as e:
            print(f"Error while collecting images: {e}")
            raise  # re-raise so the retry logic kicks in
    return img_links


def save_urls(folder_path, img_links):
    # Path of the file that stores the collected links
    save_path = os.path.join(folder_path, 'img_links.txt')
    # Write the image links to the file
    with open(save_path, 'w', encoding='utf-8') as file:
        for link in img_links:
            file.write(link + '\n')
    print(f"Image links saved to: {save_path}")


def new_folder(page_title):
    # Directory the script lives in
    script_dir = os.path.dirname(os.path.abspath(__file__))
    download_dir = os.path.join(script_dir, 'downloads')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    if page_title:
        # Build the target folder path
        folder_path = os.path.join(download_dir, page_title)
        # Create the folder if it does not exist yet
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        return folder_path


def get_chapter_data(client, target_url):
    result = {}
    page_title = ''
    try:
        response = client.get(target_url)
        if response.status_code != 200:
            raise Exception(f"Failed to access {target_url}, status code: {response.status_code}")
        soup = BeautifulSoup(response.text, 'html.parser')
        # Select every link under the chapter list container
        elements = soup.select(
            'body > div.container > div:nth-child(3) > div:nth-child(2) a')
        # Extract the URL and text of each element
        for element in elements:
            url = element.get('href')
            text = element.get_text()
            result[text] = base_url + url
    except Exception as e:
        print(f"Error while fetching chapter data: {e}")
        raise  # re-raise so the retry logic kicks in
    return result


def main():
    proxy_url = 'http://127.0.0.1:7890'
    herf_url = '/eps/'
    # Custom request headers
    custom_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "cookie": "_gid=GA1.2.724162267.1736817775; _ga_QL6YSDRWEV=GS1.1.1736837388.3.0.1736837388.0.0.0; _ga=GA1.2.1324234734.1736817774; _gat=1",
        "priority": "u=0, i",
        "sec-ch-ua": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }
    for comico_url in comico_urls:
        target_url = base_url + herf_url + comico_url
        print(target_url)
        # Maximum number of retries
        max_retries = 999
        retry_count = 0
        while retry_count < max_retries:
            try:
                # Create an httpx.Client with the custom headers and optional proxy
                with httpx.Client(proxies=proxy_url if use_proxy else None, headers=custom_headers) as client:
                    # 1. Fetch the chapter data from the index page
                    chapter_data = get_chapter_data(client, target_url)
                    print(chapter_data)
                    # 2. Create a folder named after the title to hold the images
                    folder_path = new_folder(comico_url)
                    # 3. Walk the chapter data and collect the image links
                    img_links = get_imgs(client, folder_path, chapter_data)
                    print(img_links)
                    # 4. Save the URLs into the new folder
                    save_urls(folder_path, img_links)
                    # 5. Download each link into folder_path, named like 0001.png
                    save_img(client, folder_path, img_links)
                # Everything succeeded, leave the retry loop
                print('done!')
                break
            except Exception as e:
                retry_count += 1
                print(f"Error: {e}, starting retry #{retry_count}...")
                if retry_count >= max_retries:
                    print("Maximum number of retries reached, aborting.")
                    break
                # Fixed delay of 30 seconds before retrying
                delay = 30
                print(f"Waiting {delay} seconds before retrying...")
                time.sleep(delay)


if __name__ == '__main__':
    main()

zcymh/zcymh.py
@@ -0,0 +1,261 @@
# -*- coding: utf-8 -*-
import platform
import time
import random
from datetime import datetime
import re
import os

from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import httpx


def browser_opt():
    # Configure the browser before opening it
    os_name = platform.system()
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-setuid-sandbox')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--headless')  # run headless
    # chrome_options.add_argument('--incognito')  # incognito (private) mode
    # chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # manually point to the browser binary
    if os_name == 'Linux':
        chrome_options.binary_location = '/usr/bin/chromium'  # on Linux the chromium path must be given explicitly
    else:
        pass  # other systems do not need an explicit path
    browser = webdriver.Chrome(options=chrome_options)
    return browser


def browser_open(browser, url):
    # Navigate the browser to the url
    browser.get(url)
    time.sleep(random.uniform(1, 2))
    return browser


def browser_get_page_source(browser):
    # Return the source of the current page
    return browser.page_source


def browser_find_by_selector(browser, selector):
    # Look up an element by CSS selector
    try:
        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        element = browser.find_element(By.CSS_SELECTOR, selector)
        if not element:
            return None
        return element.text
    except Exception as e:
        print(e)
        return None


def browser_screenshot(browser):
    # Title of the current page
    title = browser.title
    # Current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Build the file name
    filename = f"{title.replace(' ', '')}_{timestamp}.png"
    # Save the screenshot
    browser.save_screenshot(filename)
    print(f"Screenshot saved: {filename}")


def browser_close(browser):
    browser.close()


def sanitize_filename(string):
    # Strip characters Windows does not allow in file names
    allowed_chars = re.compile(r'[<>:"/\\|?*]')
    sanitized_filename = allowed_chars.sub('', string)
    # Replace spaces with underscores
    sanitized_filename = sanitized_filename.replace(' ', '_')
    # Make sure the name does not start with a dot
    if sanitized_filename.startswith('.'):
        sanitized_filename = '_' + sanitized_filename[1:]
    # Make sure the name does not contain two consecutive dots
    sanitized_filename = sanitized_filename.replace('..', '.')
    # Make sure the name is not empty
    if not sanitized_filename:
        sanitized_filename = 'noname' + '_' + str(int(time.time()))
    return sanitized_filename


def task1():
    browser = browser_opt()
    print('Opening browser')
    browser = browser_open(browser, url)
    print(f'Navigating to url: {url}')
    page_source = browser_get_page_source(browser)
    # Use the comic title as the folder name
    book_name = re.findall('meta property="og:novel:book_name" content="(.*?)"', page_source)
    if book_name:
        book_name = book_name[0]
        book_name = sanitize_filename(book_name)
    else:
        print("Failed to get the comic title")
        exit(0)
    # Collect the url of every chapter
    all_set = []
    host = 'https://zcymh.com'
    start_tag = '<ol class="chapter-list col-4 text" id="j_chapter_list">'
    end_tag = '</ol>'
    start_index = page_source.find(start_tag)
    end_index = page_source.find(end_tag, start_index)
    if start_index != -1 and end_index != -1:
        target_element = page_source[start_index + len(start_tag):end_index]
        pattern = r'<a title="(.*?)" href="(.*?)" target="_self">'
        matches = re.findall(pattern, target_element)
        set_num = 1
        for match in matches:
            title = sanitize_filename(match[0])
            set_url = host + match[1]
            # Reading-order index, chapter name, chapter url
            all_set.append([str(set_num).zfill(4), title, set_url])
            set_num += 1
    # Visit each chapter url and collect every image into one combined list
    all_data_list = []
    for set_data in all_set:
        browser = browser_open(browser, set_data[2])
        page_source = browser_get_page_source(browser)
        page_list = re.findall('<img src="(.*?)" width', page_source)
        print(f'Fetching {set_data[1]}')
        page_num = 1
        for page in page_list:
            # One row of the db / csv
            all_data_list.append({
                'comico_serial': set_data[0],
                'set_name': set_data[1],
                'page_num': page_num,
                'set_url': set_data[2],
                'img_url': page,
                'is_download': 0,
            })
            page_num += 1
    # Store the combined list in MongoDB
    conn = MongoClient(mongodb_link)
    db = conn[db_name]
    collection = db[book_name]
    for data in all_data_list:
        data_exists = collection.find_one({"img_url": data['img_url']})
        if data_exists is None:
            try:
                result = collection.insert_one(data)
                print(f"Inserted: {data['comico_serial']}\t{data['set_name']}\t{data['page_num']}")
            except Exception as e:
                print(f"Insert failed: {e}")
        else:
            print(f'Document already exists: {data}')
    comico_path = os.path.join(os.getcwd(), 'comico')
    if not os.path.exists(comico_path):
        os.makedirs(comico_path)
    # After all data has been written, create the comic folder
    file_path = os.path.join(comico_path, book_name)
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    browser_close(browser)


def task2():
    file_path = os.path.join(os.getcwd(), 'comico', load_book_name)
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    client = MongoClient(mongodb_link)
    db = client[db_name]
    collection = db[load_book_name]
    # Reset is_download
    # for document in collection.find():
    #     collection.update_one({'_id': document['_id']}, {'$set': {'is_download': 0}})
    # Read every document in the collection
    try:
        for document in collection.find():
            if document['is_download'] == 0:
                try:
                    resp = httpx.get(document['img_url'], headers=headers)
                    if resp.status_code != 200:
                        err = f'Image request failed, status code: {resp.status_code}'
                        raise Exception(err)
                    set_file_name = document['comico_serial'] + '_' + sanitize_filename(document['set_name'])
                    if not os.path.exists(os.path.join(file_path, set_file_name)):
                        os.makedirs(os.path.join(file_path, set_file_name))
                    img_name = str(document['page_num']).zfill(4)
                    suffix = document['img_url'].split('.')[-1]
                    img_path = file_path + '/' + set_file_name + '/' + img_name + '.' + suffix
                    with open(img_path, 'wb') as f:
                        f.write(resp.content)
                    # Mark the document as downloaded
                    collection.update_one({'_id': document['_id']}, {'$set': {'is_download': 1}})
                    print(f"Updated document: {load_book_name}\t{document['comico_serial']}\t{document['set_name']}\t{document['page_num']}")
                except Exception as e:
                    print(f"Error while processing document: {e}")
            else:
                print("Already downloaded, skipping")
    except Exception as e:
        print(f"Error while reading the collection: {e}")
    # Close the database connection
    client.close()


if __name__ == "__main__":
    choose = 2
    mongodb_link = 'mongodb://root:aaaAAA111!!!@192.168.31.177:38000/'
    db_name = 'comico'
    if choose == 1:
        comico_id = '384'
        url = 'https://zcymh.com/manben/{}/'.format(comico_id)
        host = 'https://zcymh.com'
        task1()
    elif choose == 2:
        load_book_name = '诚如神之所说'
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
        task2()

zhuimh/merge_images.py
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
import os

from PIL import Image

current_dir_path = os.path.dirname(os.path.abspath(__file__))
# Starting directory
start_dir = os.path.join(current_dir_path, 'downloads')
# Walk the downloads folder
for root, dirs, files in os.walk(start_dir):
    for dir in dirs:
        sub_dir = os.path.join(root, dir)
        for sub_root, sub_dirs, sub_files in os.walk(sub_dir):
            for sub_sub_dir in sub_dirs:
                sub_sub_dir_path = os.path.join(sub_root, sub_sub_dir)
                print(sub_sub_dir_path)
                png_count = 0
                images = []
                for file in os.listdir(sub_sub_dir_path):
                    if file.lower().endswith('.png'):
                        images.append(os.path.join(sub_sub_dir_path, file))
                        png_count += 1
                if not images:
                    raise ValueError("The image list must not be empty")
                # Sort so the zero-padded page files stitch in reading order
                images.sort()
                # Stitch the chapter's pages vertically into one tall image
                total_image = Image.open(images[0])
                for image in images[1:]:
                    img = Image.open(image)
                    new_image = Image.new('RGB', (max(total_image.width, img.width), total_image.height + img.height))
                    new_image.paste(total_image, (0, 0))
                    new_image.paste(img, (0, total_image.height))
                    total_image = new_image
                total_image.save(f'{sub_sub_dir_path}.png')
            break  # only the first level of the chapter walk
        break  # only the first comic folder
    break  # only the first level of the downloads walk

zhuimh/zhuimh.py
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
import os
import time
import asyncio
import re
import sqlite3

import httpx
from playwright.sync_api import sync_playwright

comico_id = '419025'
base_url = 'https://www.zhuimh.com'
target_href_url = 'https://www.zhuimh.com/comic/'
scroll_speed = 2
current_dir_path = os.path.dirname(os.path.abspath(__file__))
download_folder = os.path.join(current_dir_path, 'downloads')
if not os.path.exists(download_folder):
    os.mkdir(download_folder)
db_path = os.path.join(download_folder, 'zhuimh.db')
txt_path = os.path.join(download_folder, 'target_comico_name.txt')


def create_db(title):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        f'CREATE TABLE IF NOT EXISTS {title} (id INTEGER PRIMARY KEY AUTOINCREMENT, chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
    )
    conn.commit()
    cursor.close()
    conn.close()


def write_to_db(title, chapter_name, url):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # Check whether this chapter_name already exists
    cursor.execute(
        f'SELECT EXISTS(SELECT 1 FROM {title} WHERE chapter_name = ?)', (chapter_name,))
    exists = cursor.fetchone()[0]
    if not exists:
        # Insert a new record only if it does not exist yet
        cursor.execute(f'INSERT INTO {title} (chapter_name, url) VALUES (?, ?)', (chapter_name, url))
        conn.commit()
    cursor.close()
    conn.close()


async def async_get_chapter_list():
    async with httpx.AsyncClient() as client:
        chapters_data = {}
        response = await client.get(target_href_url + comico_id)
        if response.status_code == 200:
            text = response.text
            title = re.findall(r'<h4>(.*?)</h4>', text)
            title = title[0] if title else comico_id
            print(title)
            # Write the current target name to a txt file first; later, when the
            # database is read back, this name is used as the table name
            with open(txt_path, 'w', encoding='utf-8') as f:
                print('Writing current target name')
                f.write(title)
            chapters = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', text)
            for chapter in chapters:
                chapters_data[chapter[1]] = base_url + chapter[0]
            # Store the data in sqlite
            create_db(title)
            for chapter_name, url in chapters_data.items():
                write_to_db(title, chapter_name, url)
            print('Chapter data saved')


async def get_chapter_list():
    await async_get_chapter_list()


def load_db(title):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(f'SELECT * FROM {title} WHERE state = 0 ORDER BY id ASC')
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def change_db_data_state(data_id, t_name):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    table_name = t_name
    id_column = 'id'
    id_value = data_id
    bool_column = 'state'
    sql = f'UPDATE {table_name} SET {bool_column} = 1 WHERE {id_column} = ?'
    cursor.execute(sql, (id_value,))
    conn.commit()
    cursor.close()
    conn.close()


def scroll_to_percentage(page):
    # Scroll the page in small steps to trigger the lazy-loaded images
    percentage_list = [i for i in range(5, 101, scroll_speed)]
    for percentage in percentage_list:
        # Compute the given percentage of the page height
        height = page.evaluate("() => document.body.scrollHeight")
        scroll_position = height * (percentage / 100)
        # Jump to that position
        page.evaluate(f"window.scrollTo(0, {scroll_position})")
        time.sleep(0.5)


def request_chapter_data(title, data_id, chapter_name, chapter_url):
    chapter_folder = os.path.join(current_dir_path, 'downloads', title, chapter_name)
    with sync_playwright() as playwright:
        try:
            browser = playwright.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(chapter_url)
            page.wait_for_load_state('networkidle')
        except Exception as e:
            print(e)
            return False
        # Scroll through the page
        print('Scrolling page')
        scroll_to_percentage(page)
        page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
        scroll_to_percentage(page)
        print('Scrolling done')
        time.sleep(2)
        # Check whether every image finished loading; if not, leave the function
        # so the caller reopens the browser and retries
        html_content = page.content()
        check_list = re.findall('img class="lazy-read" src="(.*?)"', html_content)
        for l in check_list:
            if 'lazy-read.gif' in l:
                return False
        # Create the chapter folder
        if not os.path.exists(chapter_folder):
            os.makedirs(chapter_folder)
        # Number of matching image elements
        total_images = page.locator('.lazy-read').count()
        for page_num in range(1, total_images + 1):
            img_locator = f'body > div.chpater-images > img:nth-child({page_num})'
            img_path = os.path.join(chapter_folder, f'{str(page_num).zfill(3)}.png')
            page.locator(img_locator).screenshot(path=img_path)
            print(f'Downloaded {img_path}')
        # After the chapter is done, flip its state flag to True
        print(f'{chapter_name} finished downloading\n\n')
        change_db_data_state(data_id, title)
        browser.close()
        return True


def main():
    asyncio.run(get_chapter_list())
    # Crawl each chapter; first read back the current target name
    title = ''
    with open(txt_path, 'r', encoding='utf-8') as f:
        title = f.read()
    folder_name = os.path.join(download_folder, title)
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    for retry in range(999):
        load_data = load_db(title)
        if not load_data:
            print('The database has no data or all done!')
            exit(0)
        for data in load_data:
            ok = True
            data_id = data[0]
            chapter_name = data[1]
            chapter_url = data[2]
            print(f'Fetching images: {title} {chapter_name}')
            ok = request_chapter_data(title, data_id, chapter_name, chapter_url)
            if not ok:
                print(f'Image loading failed: {title} {chapter_name}, retrying\n\n')
                time.sleep(5)
                break


if __name__ == "__main__":
    main()