commit efcc609cc5
@@ -0,0 +1,6 @@
.DS_Store
__pycache__/
*.pyc
.idea

*/downloads/*
@@ -0,0 +1,168 @@
import time
import re
import os
import sqlite3

import httpx
from playwright.sync_api import sync_playwright

current_dir_path = os.path.dirname(os.path.abspath(__file__))

comico_key = 'OMzNzNS'
base_url = 'https://www.dumanwu.com'
target_url = base_url + '/' + comico_key

download_folder = os.path.join(current_dir_path, 'downloads')
if not os.path.exists(download_folder):
    os.mkdir(download_folder)


def write_db(title, db_path, chapter_folder_name, chapter_url):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        f'CREATE TABLE IF NOT EXISTS {title} (id INTEGER PRIMARY KEY AUTOINCREMENT, chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
    )
    conn.commit()

    # Check whether this chapter_name already exists
    cursor.execute(
        f'SELECT EXISTS(SELECT 1 FROM {title} WHERE chapter_name = ?)', (chapter_folder_name,))
    exists = cursor.fetchone()[0]

    if not exists:
        # Insert a new record only if it is not there yet
        cursor.execute(f'INSERT INTO {title} (chapter_name, url) VALUES (?, ?)', (chapter_folder_name, chapter_url))
        conn.commit()

    cursor.close()
    conn.close()
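

# SQLite cannot parameterize identifiers, so `title` above is interpolated
# straight into the SQL as a table name. A minimal guard (a hypothetical
# helper, not called by the original flow) would whitelist it first:
def safe_table_name(title):
    # Keep word characters only; prefix so the name never starts with a digit.
    return 't_' + re.sub(r'\W', '_', title)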


def load_db(title, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(f'SELECT * FROM {title} WHERE state = 0 ORDER BY id ASC')
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def fetch_page_title(target_url):
    with httpx.Client(verify=False) as client:  # skip certificate verification
        response = client.get(target_url)
        if response.status_code != 200:
            print(f'Error: {response.status_code}')
            exit(0)
        title = re.findall(r'<p></p><h1 class="name_mh">(.*?)</h1><p></p>', response.text)
        if title:
            return title[0]
        else:
            print("Title not found")
            exit(0)


def fetch_chapter_data():
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=True,
            args=['--ignore-certificate-errors']
        )
        page = browser.new_page()
        page.goto(target_url)

        time.sleep(1)

        # Click the "load all chapters" button, retrying up to three times
        button_selector = 'body > div > div > div.forminfo > div.chapterList > div.chapterlistload > div > button'
        for i in range(3):
            try:
                page.click(button_selector)
                break
            except Exception:
                pass

        page.wait_for_timeout(1000)

        source = page.content()

        ul_list = re.findall('<ul>(.*?)</ul>', source, re.DOTALL)
        if len(ul_list) > 0:
            ul_list = ul_list[0]
        else:
            return False

        chapter_url_list = re.findall('<a href="(.*?)">', ul_list)
        chapter_name_list = re.findall('<li>(.*?)</li>', ul_list)

        # The page lists newest first; reverse into reading order
        chapter_url_list = chapter_url_list[::-1]
        chapter_name_list = chapter_name_list[::-1]

        result = {}

        chapter_count = 1
        for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
            chapter_count_str = str(chapter_count).zfill(4)
            chapter_url = base_url + chapter_url
            result[chapter_count_str] = (chapter_name, chapter_url)
            chapter_count += 1

        browser.close()

        return result
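

# Shape of the mapping fetch_chapter_data returns (entry values are
# illustrative; real names and paths come from the site):
#     {'0001': ('chapter name', 'https://www.dumanwu.com/<chapter path>'), ...}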


def fetch_images(data, chapter_folder_name):
    data_id = data[0]
    chapter_url = data[2]
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=False,
            args=['--ignore-certificate-errors']
        )
        page = browser.new_page()
        page.goto(chapter_url)

        time.sleep(1)

        html_content = page.content()  # full page HTML after rendering
        img_list = re.findall(r'<div class="main_img"><div class="chapter-img-box">([\S\s]*?)</a></div>', html_content)
        img_list = img_list[0]
        # Non-capturing group so findall returns plain URLs, not tuples
        urls = re.findall('<img (?:src="|data-src=")(.*?)"', img_list)
        for page_num, url in enumerate(urls, start=1):
            # Navigate straight to the image and save the response bytes
            # (assumes the URL serves the image file directly)
            response = page.goto(url)
            img_path = os.path.join(chapter_folder_name, f'{str(page_num).zfill(4)}.png')
            with open(img_path, 'wb') as f:
                f.write(response.body())
        browser.close()


def main():
    print(target_url)
    # ------------------------------ step 1 ------------------------------
    title = fetch_page_title(target_url)

    comico_folder = os.path.join(download_folder, title)
    if not os.path.exists(comico_folder):
        os.mkdir(comico_folder)

    # Create the chapter db and store the chapter data in it
    db_path = os.path.join(comico_folder, 'comico.db')

    # Fetch each chapter's title and url
    chapter_data = fetch_chapter_data()

    for k, v in chapter_data.items():
        chapter_url = v[1]
        write_db(title, db_path, k + '_' + v[0], chapter_url)

    # ------------------------------ step 2 ------------------------------
    all_data = load_db(title, db_path)

    for data in all_data:
        chapter_folder_name = os.path.join(comico_folder, data[1])
        if not os.path.exists(chapter_folder_name):
            os.mkdir(chapter_folder_name)

        fetch_images(data, chapter_folder_name)
        time.sleep(999)  # long pause between chapters


if __name__ == '__main__':
    main()
@@ -0,0 +1,205 @@
import os
import time
import random

import httpx
from bs4 import BeautifulSoup

comico_urls = [
    '[PIXIV] LotteryFate (18900473)(AI)',
]

# Whether to use a proxy
use_proxy = 1

# Site root; get_chapter_data joins chapter hrefs onto this
base_url = 'https://jcomic.net'


def save_img(client, folder_path, img_links):
    for index, img_url in enumerate(img_links, start=1):
        try:
            # Build the file name, e.g. 0001.png, 0002.png
            file_name = f"{str(index).zfill(4)}.png"
            file_path = os.path.join(folder_path, file_name)

            # Skip files that already exist on disk
            if os.path.exists(file_path):
                print(f"File already exists, skipping download: {file_path}")
                continue

            # Request the image content
            response = client.get(img_url)
            if response.status_code != 200:
                raise Exception(
                    f"Could not download image {img_url}, status code: {response.status_code}")

            # Save the image locally
            with open(file_path, 'wb') as file:
                file.write(response.content)

            print(f"Image saved: {file_path}")
        except Exception as e:
            raise Exception(f"Error downloading image {img_url}: {e}")

        # random_sleep = random.uniform(2, 3)
        # print(f"Sleeping a random {random_sleep} seconds")
        # time.sleep(random_sleep)
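

# A possible refinement (a sketch only, not wired into main below): back off
# exponentially between retries rather than sleeping a fixed interval.
def backoff_delay(retry_count, base=5, cap=600):
    # 5 s, 10 s, 20 s, ... capped at 10 minutes, plus a little jitter.
    return min(cap, base * (2 ** retry_count)) + random.uniform(0, 1)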


def get_imgs(client, folder_path, chapter_data):
    img_links = []
    for chapter_name, url in chapter_data.items():
        try:
            # Request the chapter page
            response = client.get(url)
            if response.status_code != 200:
                raise Exception(f"Could not reach {url}, status code: {response.status_code}")

            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # The element one level above the images
            parent_element = soup.select_one(
                'body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
            if not parent_element:
                raise Exception(f"{chapter_name}: image container not found")

            # Collect all image elements
            img_elements = parent_element.select('img')
            total_images = len(img_elements)
            print(f'{chapter_name} has {total_images} images')

            # Record each image URL
            for img in img_elements:
                img_url = img.get('src')
                if img_url:
                    img_links.append(img_url)
        except Exception as e:
            print(f"Error while fetching images: {e}")
            raise  # re-raise to trigger the retry logic
    return img_links


def save_urls(folder_path, img_links):
    # Where the link list is written
    save_path = os.path.join(folder_path, 'img_links.txt')

    # Write the image links to the file
    with open(save_path, 'w', encoding='utf-8') as file:
        for link in img_links:
            file.write(link + '\n')

    print(f"Image links saved to: {save_path}")


def new_folder(page_title):
    # Directory the script lives in
    script_dir = os.path.dirname(os.path.abspath(__file__))
    download_dir = os.path.join(script_dir, 'downloads')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    if page_title:
        # Build the target folder path
        folder_path = os.path.join(download_dir, page_title)

        # Create the folder if it does not exist yet
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

        return folder_path


def get_chapter_data(client, target_url):
    result = {}
    page_title = ''

    try:
        response = client.get(target_url)
        if response.status_code != 200:
            raise Exception(f"Could not reach {target_url}, status code: {response.status_code}")

        soup = BeautifulSoup(response.text, 'html.parser')

        # All chapter links under the given selector
        elements = soup.select(
            'body > div.container > div:nth-child(3) > div:nth-child(2) a')

        # Extract the URL and text of each element
        for element in elements:
            url = element.get('href')
            text = element.get_text()
            result[text] = base_url + url
    except Exception as e:
        print(f"Error while fetching chapter data: {e}")
        raise  # re-raise to trigger the retry logic

    return result


def main():
    proxy_url = 'http://127.0.0.1:7890'
    href_url = '/eps/'
    # Custom request headers
    custom_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "cookie": "_gid=GA1.2.724162267.1736817775; _ga_QL6YSDRWEV=GS1.1.1736837388.3.0.1736837388.0.0.0; _ga=GA1.2.1324234734.1736817774; _gat=1",
        "priority": "u=0, i",
        "sec-ch-ua": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }

    for comico_url in comico_urls:
        target_url = base_url + href_url + comico_url
        print(target_url)
        # Maximum number of retries
        max_retries = 999
        retry_count = 0

        while retry_count < max_retries:
            try:
                # Create an httpx.Client with the custom headers
                with httpx.Client(proxies=proxy_url if use_proxy else None, headers=custom_headers) as client:
                    # 1. Fetch the chapter data for the page
                    chapter_data = get_chapter_data(client, target_url)
                    print(chapter_data)

                    # 2. Create a folder (named after the title) to hold the images
                    folder_path = new_folder(comico_url)

                    # 3. Walk the chapter data and collect the image links
                    img_links = get_imgs(client, folder_path, chapter_data)
                    print(img_links)

                    # 4. Save the URLs into the new folder
                    save_urls(folder_path, img_links)

                    # 5. Download each link into folder_path, named like 0001.png
                    save_img(client, folder_path, img_links)

                    # Everything succeeded; leave the retry loop
                    print('done!')
                    break

            except Exception as e:
                retry_count += 1
                print(f"Error: {e}, starting retry #{retry_count}...")
                if retry_count >= max_retries:
                    print("Maximum retries reached, aborting.")
                    break

                # Fixed 30-second delay before retrying
                delay = 30
                print(f"Waiting {delay} seconds before retrying...")
                time.sleep(delay)


if __name__ == '__main__':
    main()
@@ -0,0 +1,261 @@
# -*- coding: utf-8 -*-

import platform
import time
import random
from datetime import datetime
import re
import os
from pymongo import MongoClient

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import httpx


def browser_opt():
    # Configure the browser before opening it
    os_name = platform.system()
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-setuid-sandbox')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--headless')  # run headless
    # chrome_options.add_argument('--incognito')  # incognito (private) mode
    # chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # point at a specific browser binary

    if os_name == 'Linux':
        chrome_options.binary_location = '/usr/bin/chromium'  # Linux must point at the chromium binary
    else:
        pass  # other systems need no explicit path

    browser = webdriver.Chrome(options=chrome_options)

    return browser


def browser_open(browser, url):
    # Navigate the browser to the url
    browser.get(url)
    time.sleep(random.uniform(1, 2))
    return browser


def browser_get_page_source(browser):
    # Return the current page source
    return browser.page_source


def browser_find_by_selector(browser, selector):
    # Look up an element by CSS selector
    try:
        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        element = browser.find_element(By.CSS_SELECTOR, selector)
        if not element:
            return None
        return element.text
    except Exception as e:
        print(e)
        return None


def browser_screenshot(browser):
    # Title of the current page
    title = browser.title
    # Timestamp for the current time
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Build the file name
    filename = f"{title.replace(' ', '')}_{timestamp}.png"
    # Save the screenshot
    browser.save_screenshot(filename)
    print(f"Screenshot saved: {filename}")


def browser_close(browser):
    browser.close()


def sanitize_filename(string):
    # Strip characters Windows does not allow
    allowed_chars = re.compile(r'[<>:"/\\|?*]')
    sanitized_filename = allowed_chars.sub('', string)

    # Replace spaces with underscores
    sanitized_filename = sanitized_filename.replace(' ', '_')

    # Make sure the name does not start with a dot
    if sanitized_filename.startswith('.'):
        sanitized_filename = '_' + sanitized_filename[1:]

    # Make sure the name has no two consecutive dots
    sanitized_filename = sanitized_filename.replace('..', '.')

    # Make sure the name is not an empty string
    if not sanitized_filename:
        sanitized_filename = 'noname' + '_' + str(int(time.time()))

    return sanitized_filename
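

# Illustrative behaviour of sanitize_filename (hypothetical inputs, traced
# from the rules above):
#     sanitize_filename('One: Two?')  -> 'One_Two'   (':' and '?' stripped, space -> '_')
#     sanitize_filename('.hidden')    -> '_hidden'   (no leading dot)
#     sanitize_filename('<>:"/\\|?*') -> 'noname_<timestamp>'  (everything stripped -> fallback)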


def task1():
    browser = browser_opt()
    print('Opening the browser')
    browser = browser_open(browser, url)
    print(f'Going to url: {url}')

    page_source = browser_get_page_source(browser)

    # Comic name, used as the folder name
    book_name = re.findall('meta property="og:novel:book_name" content="(.*?)"', page_source)
    if book_name:
        book_name = book_name[0]

        book_name = sanitize_filename(book_name)
    else:
        print("Failed to get the comic name")
        exit(0)

    # Collect the url of every episode
    all_set = []

    host = 'https://zcymh.com'

    start_tag = '<ol class="chapter-list col-4 text" id="j_chapter_list">'
    end_tag = '</ol>'
    start_index = page_source.find(start_tag)
    end_index = page_source.find(end_tag, start_index)
    if start_index != -1 and end_index != -1:
        target_element = page_source[start_index + len(start_tag):end_index]
        pattern = r'<a title="(.*?)" href="(.*?)" target="_self">'
        matches = re.findall(pattern, target_element)
        set_num = 1
        for match in matches:
            title = sanitize_filename(match[0])
            set_url = host + match[1]
            # reading-order index, episode name, episode url
            all_set.append([str(set_num).zfill(4), title, set_url])
            set_num += 1

    # Visit each episode url, collect every image into one master list
    all_data_list = []
    for set_data in all_set:
        browser = browser_open(browser, set_data[2])

        page_source = browser_get_page_source(browser)
        page_list = re.findall('<img src="(.*?)" width', page_source)
        print(f'Fetching {set_data[1]}')
        page_num = 1
        for page in page_list:
            # One row of the db (or csv) per image
            all_data_list.append({
                'comico_serial': set_data[0],
                'set_name': set_data[1],
                'page_num': page_num,
                'set_url': set_data[2],
                'img_url': page,
                'is_download': 0,
            })
            page_num += 1

    # Store the master list in MongoDB
    conn = MongoClient(mongodb_link)
    db = conn[db_name]
    collection = db[book_name]

    for data in all_data_list:
        data_exists = collection.find_one({"img_url": data['img_url']})
        if data_exists is None:
            try:
                result = collection.insert_one(data)
                print(f"Inserted ObjectId {result.inserted_id}: {data['comico_serial']}\t{data['set_name']}\t{data['page_num']}")
            except Exception as e:
                print(f"Insert failed, error: {e}")
        else:
            print(f'Data already exists: {data}')

    comico_path = os.path.join(os.getcwd(), 'comico')
    if not os.path.exists(comico_path):
        os.makedirs(comico_path)

    # All data written; create the book folder
    file_path = os.path.join(comico_path, book_name)
    if not os.path.exists(file_path):
        os.mkdir(file_path)

    browser_close(browser)
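

# A sketch of an alternative de-duplication strategy (assumes permission to
# create indexes): a unique index on img_url lets MongoDB reject duplicates
# server-side, replacing the per-document find_one() lookup above.
#
#     collection.create_index('img_url', unique=True)
#     try:
#         collection.insert_many(all_data_list, ordered=False)
#     except Exception:
#         pass  # duplicate-key errors for already-stored pages are expected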


def task2():
    file_path = os.path.join(os.getcwd(), 'comico', load_book_name)

    if not os.path.exists(file_path):
        os.mkdir(file_path)

    client = MongoClient(mongodb_link)

    db = client[db_name]

    collection = db[load_book_name]

    # Reset is_download:
    # for document in collection.find():
    #     collection.update_one({'_id': document['_id']}, {'$set': {'is_download': 0}})

    # Read every document in the collection
    try:
        for document in collection.find():
            if document['is_download'] == 0:
                # Download this page
                try:
                    resp = httpx.get(document['img_url'], headers=headers)
                    if resp.status_code != 200:
                        err = f'Image request failed, status code: {resp.status_code}'
                        raise Exception(err)

                    set_file_name = document['comico_serial'] + '_' + sanitize_filename(document['set_name'])

                    if not os.path.exists(os.path.join(file_path, set_file_name)):
                        os.makedirs(os.path.join(file_path, set_file_name))

                    img_name = str(document['page_num']).zfill(4)

                    suffix = document['img_url'].split('.')[-1]

                    img_path = file_path + '/' + set_file_name + '/' + img_name + '.' + suffix

                    with open(img_path, 'wb') as f:
                        f.write(resp.content)

                    # On success, flip the is_download field to 1
                    collection.update_one({'_id': document['_id']}, {'$set': {'is_download': 1}})
                    print(f"Updated document: {load_book_name}\t{document['comico_serial']}\t{document['set_name']}\t{document['page_num']}")
                except Exception as e:
                    print(f"Error while processing document: {e}")
            else:
                print("Already downloaded, skipping")
    except Exception as e:
        print(f"Error while reading the collection: {e}")

    # Close the database connection
    client.close()


if __name__ == "__main__":
    choose = 2

    mongodb_link = 'mongodb://root:aaaAAA111!!!@192.168.31.177:38000/'
    db_name = 'comico'

    if choose == 1:
        comico_id = '384'
        url = 'https://zcymh.com/manben/{}/'.format(comico_id)
        host = 'https://zcymh.com'
        task1()
    elif choose == 2:
        load_book_name = '诚如神之所说'
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
        task2()
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
import os
from PIL import Image

current_dir_path = os.path.dirname(os.path.abspath(__file__))
# Starting directory
start_dir = os.path.join(current_dir_path, 'downloads')


# Walk the downloads folder
for root, dirs, files in os.walk(start_dir):
    for dir in dirs:
        sub_dir = os.path.join(root, dir)
        for sub_root, sub_dirs, sub_files in os.walk(sub_dir):
            for sub_sub_dir in sub_dirs:
                sub_sub_dir_path = os.path.join(sub_root, sub_sub_dir)
                print(sub_sub_dir_path)
                png_count = 0
                images = []
                # sorted() keeps the zero-padded page names in reading order
                for file in sorted(os.listdir(sub_sub_dir_path)):
                    if file.lower().endswith('.png'):
                        images.append(os.path.join(sub_sub_dir_path, file))
                        png_count += 1

                if not images:
                    raise ValueError("The image list must not be empty")

                total_image = Image.open(images[0])

                for image in images[1:]:
                    img = Image.open(image)

                    new_image = Image.new('RGB', (max(total_image.width, img.width), total_image.height + img.height))

                    new_image.paste(total_image, (0, 0))
                    new_image.paste(img, (0, total_image.height))

                    total_image = new_image

                total_image.save(f'{sub_sub_dir_path}.png')
                break  # stop after the first chapter folder
            break  # stop after the first walk level
        break  # stop after the first comic folder
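

# A less memory-hungry variant of the stitching loop above (a sketch that
# produces the same canvas): size the output once and paste each page a
# single time, instead of reallocating a new image per page.
def stack_vertically(paths):
    imgs = [Image.open(p) for p in paths]
    canvas = Image.new('RGB', (max(i.width for i in imgs), sum(i.height for i in imgs)))
    y = 0
    for i in imgs:
        canvas.paste(i, (0, y))
        y += i.height
    return canvas
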
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
import os
import time
import httpx
import asyncio
import re
import sqlite3
from playwright.sync_api import sync_playwright

comico_id = '419025'
base_url = 'https://www.zhuimh.com'
target_href_url = 'https://www.zhuimh.com/comic/'
scroll_speed = 2
current_dir_path = os.path.dirname(os.path.abspath(__file__))

download_folder = os.path.join(current_dir_path, 'downloads')
if not os.path.exists(download_folder):
    os.mkdir(download_folder)

db_path = os.path.join(download_folder, 'zhuimh.db')
txt_path = os.path.join(download_folder, 'target_comico_name.txt')


def create_db(title):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        f'CREATE TABLE IF NOT EXISTS {title} (id INTEGER PRIMARY KEY AUTOINCREMENT, chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
    )
    conn.commit()
    cursor.close()
    conn.close()


def write_to_db(title, chapter_name, url):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Check whether this chapter_name already exists
    cursor.execute(
        f'SELECT EXISTS(SELECT 1 FROM {title} WHERE chapter_name = ?)', (chapter_name,))
    exists = cursor.fetchone()[0]

    if not exists:
        # Insert a new record only if it is not there yet
        cursor.execute(f'INSERT INTO {title} (chapter_name, url) VALUES (?, ?)', (chapter_name, url))
        conn.commit()

    cursor.close()
    conn.close()


async def async_get_chapter_list():
    async with httpx.AsyncClient() as client:
        chapters_data = {}
        response = await client.get(target_href_url + comico_id)
        if response.status_code == 200:
            text = response.text
            title = re.findall(r'<h4>(.*?)</h4>', text)
            title = title[0] if title else comico_id
            print(title)
            # Persist the current target name to a txt file first; when the
            # database is read back later, this name is the table name
            with open(txt_path, 'w', encoding='utf-8') as f:
                print('Writing the current target name')
                f.write(title)

            chapters = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', text)
            for chapter in chapters:
                chapters_data[chapter[1]] = base_url + chapter[0]

            # Store the data in sqlite
            create_db(title)

            for chapter_name, url in chapters_data.items():
                write_to_db(title, chapter_name, url)
            print('Data ok')


async def get_chapter_list():
    await async_get_chapter_list()


def load_db(title):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(f'SELECT * FROM {title} WHERE state = 0 ORDER BY id ASC')
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def change_db_data_state(data_id, t_name):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    table_name = t_name
    id_column = 'id'
    id_value = data_id
    bool_column = 'state'
    sql = f'UPDATE {table_name} SET {bool_column} = 1 WHERE {id_column} = ?'
    cursor.execute(sql, (id_value,))
    conn.commit()
    cursor.close()
    conn.close()


def scroll_to_percentage(page):
    # Scroll the page step by step so lazy-loaded images appear
    percentage_list = [i for i in range(5, 101, scroll_speed)]
    for percentage in percentage_list:
        # Height corresponding to the given percentage of the page
        height = page.evaluate("() => document.body.scrollHeight")
        scroll_position = height * (percentage / 100)
        # Jump to that position
        page.evaluate(f"window.scrollTo(0, {scroll_position})")
        time.sleep(0.5)
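

# An alternative to scrolling on a timer (a sketch, assuming the site keeps
# the img.lazy-read class and a lazy-read.gif placeholder src on images that
# have not loaded yet): wait until no placeholder remains.
#
#     page.wait_for_function(
#         "() => [...document.querySelectorAll('img.lazy-read')]"
#         ".every(i => !i.src.includes('lazy-read.gif'))",
#         timeout=30000,
#     )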


def request_chapter_data(title, data_id, chapter_name, chapter_url):
    chapter_folder = os.path.join(current_dir_path, 'downloads', title, chapter_name)
    with sync_playwright() as playwright:
        try:
            browser = playwright.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(chapter_url)
            page.wait_for_load_state('networkidle')
        except Exception as e:
            print(e)
            return False

        # Scroll the page (down, back to the top, then down again)
        print('Starting to scroll the page')
        scroll_to_percentage(page)
        page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
        scroll_to_percentage(page)
        print('Scrolling finished')
        time.sleep(2)

        # Check that every image finished loading; if not, leave the function
        # so the caller can reopen the browser and retry
        html_content = page.content()
        check_list = re.findall('img class="lazy-read" src="(.*?)"', html_content)
        for l in check_list:
            if 'lazy-read.gif' in l:
                return False

        # Create the chapter folder
        if not os.path.exists(chapter_folder):
            os.makedirs(chapter_folder)

        # Number of matching image elements
        total_images = page.locator('.lazy-read').count()

        for page_num in range(1, total_images + 1):
            # keep the 'chpater-images' spelling; it must match the site's class name
            img_locator = f'body > div.chpater-images > img:nth-child({page_num})'
            img_path = os.path.join(chapter_folder, f'{str(page_num).zfill(3)}.png')
            page.locator(img_locator).screenshot(path=img_path)
            print(f'Downloaded {img_path}')

        # Chapter done; flip the state field to True
        print(f'{chapter_name} finished downloading\n\n')
        change_db_data_state(data_id, title)

        browser.close()
        return True


def main():
    asyncio.run(get_chapter_list())

    # Crawl every chapter; first read back the current target name
    title = ''
    with open(txt_path, 'r', encoding='utf-8') as f:
        title = f.read()

    folder_name = os.path.join(download_folder, title)
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)

    for retry in range(999):
        load_data = load_db(title)

        if not load_data:
            print('The database has no data or all done!')
            exit(0)

        for data in load_data:
            ok = True
            data_id = data[0]
            chapter_name = data[1]
            chapter_url = data[2]
            print(f'About to fetch images: {title} {chapter_name}')
            ok = request_chapter_data(title, data_id, chapter_name, chapter_url)
            if not ok:
                print(f'Images failed to load: {title} {chapter_name}, retrying\n\n')
                time.sleep(5)
                break


if __name__ == "__main__":
    main()