From f557aed15a5787c371123ffbbe36311bef8459c7 Mon Sep 17 00:00:00 2001 From: jack Date: Wed, 12 Nov 2025 16:34:05 +0800 Subject: [PATCH] first commit --- .gitignore | 67 ++++++++ Dockerfile | 17 ++ Readme.md | 194 +++++++++++++++++++++ build.sh | 88 ++++++++++ config.py | 92 ++++++++++ docker-compose.yaml | 11 ++ downloader.py | 62 +++++++ logger.py | 88 ++++++++++ main.py | 305 +++++++++++++++++++++++++++++++++ performance.py | 81 +++++++++ realtime_logger.py | 117 +++++++++++++ requirements.txt | 32 ++++ src/__init__.py | 2 + src/core/__init__.py | 2 + src/core/step1.py | 83 +++++++++ src/core/step2.py | 63 +++++++ src/logging/__init__.py | 2 + src/logging/logger.py | 59 +++++++ src/logging/realtime_logger.py | 77 +++++++++ src/services/__init__.py | 2 + src/services/downloader.py | 43 +++++ src/utils/__init__.py | 2 + src/utils/performance.py | 57 ++++++ src/utils/utils.py | 25 +++ start.py | 40 +++++ static/favicon.ico | Bin 0 -> 4286 bytes static/script.js | 300 ++++++++++++++++++++++++++++++++ static/style.css | 217 +++++++++++++++++++++++ step1.py | 207 ++++++++++++++++++++++ step2.py | 187 ++++++++++++++++++++ templates/index.html | 75 ++++++++ utils.py | 35 ++++ 32 files changed, 2632 insertions(+) create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 Readme.md create mode 100644 build.sh create mode 100644 config.py create mode 100644 docker-compose.yaml create mode 100644 downloader.py create mode 100644 logger.py create mode 100644 main.py create mode 100644 performance.py create mode 100644 realtime_logger.py create mode 100644 requirements.txt create mode 100644 src/__init__.py create mode 100644 src/core/__init__.py create mode 100644 src/core/step1.py create mode 100644 src/core/step2.py create mode 100644 src/logging/__init__.py create mode 100644 src/logging/logger.py create mode 100644 src/logging/realtime_logger.py create mode 100644 src/services/__init__.py create mode 100644 src/services/downloader.py create mode 100644 src/utils/__init__.py create mode 100644 src/utils/performance.py create mode 100644 src/utils/utils.py create mode 100644 start.py create mode 100644 static/favicon.ico create mode 100644 static/script.js create mode 100644 static/style.css create mode 100644 step1.py create mode 100644 step2.py create mode 100644 templates/index.html create mode 100644 utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..319b7b6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,67 @@ +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +.idea/* +xml_files/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+other/split_clash_config/split_config
+ai_news/save_data
+daily/*.txt
+data/
+eh.tar
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..08cd082
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt \
+    -i https://pypi.tuna.tsinghua.edu.cn/simple \
+    --trusted-host pypi.tuna.tsinghua.edu.cn
+
+COPY . .
+
+RUN mkdir -p data/downloads \
+    && touch data/targets.txt data/proxy.txt \
+    && chmod -R 755 data
+
+EXPOSE 8000
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..1acc78d
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,194 @@
+# EH-Downloader
+
+A FastAPI-based E-Hentai gallery download tool with asynchronous batch downloading of gallery images.
+
+## Features
+
+- 🚀 **Async downloads**: high-throughput downloading built on asyncio
+- 🌐 **Web UI**: a modern web interface
+- 🔧 **Proxy support**: HTTP proxy configuration
+- 📁 **Smart file management**: directories are created automatically, one folder per gallery
+- 🔄 **Resumable**: continue after an interruption
+- 📊 **Progress monitoring**: real-time download progress and status
+- 🧹 **Auto cleanup**: one-click removal of temporary files and logs
+
+## Quick Start
+
+### Requirements
+
+- Python 3.11+
+- A network proxy (optional, for reaching E-Hentai)
+
+### Install dependencies
+
+```bash
+pip install -r requirements.txt
+```
+
+### Run the app
+
+```bash
+python main.py
+```
+
+Then open `http://localhost:8000` to use the web UI.
+
+### Docker deployment
+
+```bash
+# Build the image
+docker build -t eh-downloader .
+
+# Run the container
+docker-compose up -d
+```
+
+## Usage
+
+### 1. Configure proxies
+
+Add proxy entries to `data/proxy.txt`, one per line:
+
+```
+127.0.0.1:7890
+192.168.1.100:8080
+```
+
+### 2. Add target URLs
+
+Add the gallery URLs to download to `data/targets.txt`, one per line:
+
+```
+https://e-hentai.org/g/1234567/abcdef123456
+https://e-hentai.org/g/2345678/bcdefg234567
+```
+
+### 3. Start downloading
+
+1. Open the web UI
+2. Pick a proxy
+3. Click "Load target URLs" to load the URL list
+4. Click "Download URLs" to crawl the gallery links
+5. Click "Download images" to start downloading the images
+
+## Project Layout
+
+```
+ehentai-fastapi/
+├── main.py              # Main application
+├── config.py            # Configuration management
+├── logger.py            # Logging
+├── utils.py             # Utility functions
+├── step1.py             # Gallery link crawling
+├── step2.py             # Image downloading
+├── downloader.py        # Downloader class
+├── templates/           # HTML templates
+├── static/              # Static assets
+├── data/                # Data directory
+│   ├── targets.txt      # Target URL list
+│   ├── proxy.txt        # Proxy configuration
+│   ├── downloads/       # Downloaded files
+│   └── *.log            # Log files
+├── requirements.txt     # Dependencies
+├── Dockerfile           # Docker configuration
+└── docker-compose.yaml  # Docker Compose configuration
+```
+
+## Configuration
+
+### App settings (config.py)
+
+- `concurrency`: number of concurrent requests, default 20
+- `max_page`: maximum pages per gallery, default 100
+- `retry_per_page`: retries per page, default 5
+- `retry_per_image`: retries per image, default 3
+- `timeout`: request timeout, default 10 seconds
+- `image_timeout`: image download timeout, default 15 seconds
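+
+`AppConfig` is a pydantic model, so these defaults can also be overridden per instance. A minimal sketch (illustrative only; the shipped code always uses the module-level `config` instance with its defaults):
+
+```python
+from config import AppConfig
+
+cfg = AppConfig(concurrency=10, timeout=30.0)  # hypothetical overrides
+print(cfg.concurrency, cfg.get_proxies())
+```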
+
+### Logging
+
+- Log level: INFO
+- Log files: `data/app.log`, `data/crawl.log`, `data/download.log`
+- Log format: `[time] [level] message`
+
+## API Endpoints
+
+### GET /
+Main page
+
+### POST /load_urls
+Read the target URL list
+
+### POST /download_urls
+Start crawling gallery links
+
+### POST /download_images
+Start downloading images
+
+### POST /check_incomplete
+Check for incomplete downloads
+
+### POST /clean_files
+Clean up temporary files
+
+### POST /clear
+Clear the output panel
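+
+All endpoints are plain POSTs returning JSON. A minimal client sketch (assumes the server is running locally on port 8000; not part of this commit):
+
+```python
+import httpx
+
+with httpx.Client(base_url="http://localhost:8000", timeout=None) as client:
+    print(client.post("/load_urls").json())
+    # Crawl gallery links through a local proxy, then download the images.
+    print(client.post("/download_urls", json={"proxy": "127.0.0.1:7890"}).json())
+    print(client.post("/download_images", json={"proxy": "127.0.0.1:7890"}).json())
+```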
+
+## Notes
+
+1. **Network**: a stable connection and a suitable proxy are required
+2. **Disk space**: make sure there is enough space for the downloaded images
+3. **Compliance**: observe applicable laws and the site's terms of use
+4. **Proxy**: a stable proxy service is recommended for a good success rate
+
+## Troubleshooting
+
+### Common issues
+
+1. **Download failures**: check the proxy configuration and network connection
+2. **Corrupted files**: re-download, or check free disk space
+3. **Permission errors**: make sure the app has read/write permissions
+4. **Out of memory**: lower the concurrency or add more RAM
+
+### Viewing logs
+
+```bash
+# Application log
+tail -f data/app.log
+
+# Crawl log
+tail -f data/crawl.log
+
+# Download log
+tail -f data/download.log
+```
+
+## Development Notes
+
+### Code layout
+
+- **main.py**: the FastAPI application
+- **config.py**: configuration management
+- **logger.py**: logging
+- **utils.py**: utility functions
+- **step1.py**: gallery link crawling
+- **step2.py**: image downloading
+
+### Possible extensions
+
+1. Add new download sources
+2. Support more image formats
+3. Add download queue management
+4. Add user authentication
+
+## License
+
+This project is for learning and research only; observe applicable laws and regulations.
+
+## Changelog
+
+### v1.0.0
+- Initial release
+- Basic gallery downloading
+- Web UI and API endpoints
+- Docker support
\ No newline at end of file
diff --git a/build.sh b/build.sh
new file mode 100644
index 0000000..d0f920b
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+# Docker image build script
+# Usage: ./build.sh [options]
+
+IMAGE_NAME="eh-downloader"
+TAG="latest"
+DOCKERFILE="Dockerfile"
+
+# Print help
+show_help() {
+    echo "Docker image build script"
+    echo ""
+    echo "Usage: $0 [options]"
+    echo ""
+    echo "Options:"
+    echo "  -n, --name NAME    image name (default: $IMAGE_NAME)"
+    echo "  -t, --tag TAG      image tag (default: $TAG)"
+    echo "  -f, --file FILE    Dockerfile path (default: $DOCKERFILE)"
+    echo "  --no-cache         build without the cache"
+    echo "  -h, --help         show this help"
+    echo ""
+    echo "Examples:"
+    echo "  $0                     # build with the defaults"
+    echo "  $0 -n myapp -t v1.0    # set name and tag"
+    echo "  $0 --no-cache          # build without the cache"
+}
+
+# Parse command-line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -n|--name)
+            IMAGE_NAME="$2"
+            shift 2
+            ;;
+        -t|--tag)
+            TAG="$2"
+            shift 2
+            ;;
+        -f|--file)
+            DOCKERFILE="$2"
+            shift 2
+            ;;
+        --no-cache)
+            NO_CACHE="--no-cache"
+            shift
+            ;;
+        -h|--help)
+            show_help
+            exit 0
+            ;;
+        *)
+            echo "❌ Unknown option: $1"
+            show_help
+            exit 1
+            ;;
+    esac
+done
+
+# Make sure the Dockerfile exists
+if [ ! -f "$DOCKERFILE" ]; then
+    echo "❌ Dockerfile not found: $DOCKERFILE"
+    exit 1
+fi
+
+echo "🚀 Building Docker image..."
+echo "📦 Image: $IMAGE_NAME:$TAG"
+echo "📄 Dockerfile: $DOCKERFILE"
+echo "⏰ Started at: $(date)"
+
+# Run the build
+docker build $NO_CACHE -t "$IMAGE_NAME:$TAG" -f "$DOCKERFILE" .
+
+# Check the result
+if [ $? -eq 0 ]; then
+    echo ""
+    echo "✅ Image built successfully!"
+    echo "📊 Image info:"
+    docker images | grep "$IMAGE_NAME"
+
+    echo ""
+    echo "🎯 Next steps:"
+    echo "  1. Start the service with docker-compose up"
+    echo "  2. Or run the container with docker run -p 8000:8000 -v ./data:/app/data $IMAGE_NAME:$TAG"
+else
+    echo "❌ Image build failed!"
+    exit 1
+fi
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..0ff8ccb
--- /dev/null
+++ b/config.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Configuration management module.
+"""
+import os
+from pathlib import Path
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+
+class AppConfig(BaseModel):
+    """Application configuration."""
+    # Basics
+    app_name: str = "EH-Downloader"
+    app_version: str = "1.0.0"
+    debug: bool = False
+
+    # Server
+    host: str = "0.0.0.0"
+    port: int = 8000
+
+    # Data directories
+    data_dir: str = "data"
+    downloads_dir: str = "data/downloads"
+    targets_file: str = "data/targets.txt"
+    proxy_file: str = "data/proxy.txt"
+
+    # Crawler
+    concurrency: int = 20
+    max_page: int = 100
+    retry_per_page: int = 5
+    retry_per_image: int = 3
+    timeout: float = 10.0
+    image_timeout: float = 15.0
+
+    # Logging
+    log_level: str = "INFO"
+    log_format: str = "[%(asctime)s] [%(levelname)s] %(message)s"
+
+    # File cleanup
+    cleanup_patterns: List[str] = ["**/*.log", "**/*.json"]
+    cleanup_exclude: List[str] = ["data/targets.txt"]
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Make sure the directories exist
+        self._ensure_directories()
+
+    def _ensure_directories(self):
+        """Create the required directories if they are missing."""
+        Path(self.data_dir).mkdir(exist_ok=True)
+        Path(self.downloads_dir).mkdir(parents=True, exist_ok=True)
+
+    @property
+    def targets_path(self) -> Path:
+        """Path of the targets file."""
+        return Path(self.targets_file)
+
+    @property
+    def proxy_path(self) -> Path:
+        """Path of the proxy file."""
+        return Path(self.proxy_file)
+
+    def get_proxies(self) -> List[str]:
+        """Read the proxy list."""
+        if not self.proxy_path.exists():
+            return ["127.0.0.1:7890"]
+
+        try:
+            with open(self.proxy_path, 'r', encoding='utf-8') as f:
+                proxies = [line.strip() for line in f if line.strip()]
+            return proxies if proxies else ["127.0.0.1:7890"]
+        except Exception:
+            return ["127.0.0.1:7890"]
+
+    def get_targets(self) -> List[str]:
+        """Read the target URL list."""
+        if not self.targets_path.exists():
+            return []
+
+        try:
+            with open(self.targets_path, 'r', encoding='utf-8') as f:
+                urls = [line.strip() for line in f if line.strip()]
+            # Drop comment lines
+            return [url for url in urls if url and not url.startswith('#')]
+        except Exception:
+            return []
+
+
+# Global configuration instance
+config = AppConfig()
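+
+
+if __name__ == "__main__":
+    # Illustrative self-check (not part of the original design): running
+    # `python config.py` prints the resolved directories and proxy list.
+    print(f"data_dir={config.data_dir} downloads_dir={config.downloads_dir}")
+    print(f"proxies={config.get_proxies()} targets={len(config.get_targets())}")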
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..f9abfbb
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,11 @@
+services:
+  eh-downloader:
+    image: eh-downloader:latest
+    container_name: eh-downloader
+    ports:
+      - "58001:8000"
+    volumes:
+      - ./data:/app/data
+    restart: unless-stopped
+    environment:
+      - PYTHONUNBUFFERED=1
\ No newline at end of file
diff --git a/downloader.py b/downloader.py
new file mode 100644
index 0000000..69e6508
--- /dev/null
+++ b/downloader.py
@@ -0,0 +1,62 @@
+import aiofiles
+import httpx
+from typing import Optional
+import os
+
+class Downloader:
+    def __init__(self):
+        self.output_dir = "downloads"
+        os.makedirs(self.output_dir, exist_ok=True)
+
+    async def download(self, proxy_str: str, url: str) -> str:
+        """
+        Core download logic.
+        """
+        try:
+            # Build the proxy configuration when proxy_str is non-empty
+            proxy = None
+            if proxy_str and ":" in proxy_str:
+                ip, port = proxy_str.split(":", 1)
+                proxy = f"http://{ip}:{port}"
+
+            # Download asynchronously with httpx
+            async with httpx.AsyncClient(proxies=proxy, timeout=30.0) as client:
+                response = await client.get(url)
+                response.raise_for_status()
+
+                # Work out the file name
+                filename = self._get_filename(url, response)
+                filepath = os.path.join(self.output_dir, filename)
+
+                # Save the file
+                async with aiofiles.open(filepath, 'wb') as f:
+                    await f.write(response.content)
+
+                return f"Download OK: {filename}\nSaved to: {filepath}\nSize: {len(response.content)} bytes"
+
+        except Exception as e:
+            raise Exception(f"Download failed: {str(e)}")
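+
+    # Illustrative usage (assumes a running event loop; not part of the
+    # original design):
+    #   downloader = Downloader()
+    #   msg = await downloader.download("127.0.0.1:7890", "https://example.com/a.jpg")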
+    def _get_filename(self, url: str, response: httpx.Response) -> str:
+        """Derive a file name from the URL or the response headers."""
+        # Take the last path segment of the URL
+        if '/' in url:
+            filename = url.split('/')[-1]
+            if '?' in filename:
+                filename = filename.split('?')[0]
+        else:
+            filename = "downloaded_file"
+
+        # Without an extension, try to infer one from the Content-Type
+        if '.' not in filename:
+            content_type = response.headers.get('content-type', '')
+            if 'image' in content_type:
+                ext = content_type.split('/')[-1]
+                filename = f"{filename}.{ext}"
+
+        return filename or "downloaded_file"
+
+    async def download_image(self, proxy_str: str, url: str) -> str:
+        """Download an image; image-specific logic can be added here."""
+        return await self.download(proxy_str, url)
\ No newline at end of file
diff --git a/logger.py b/logger.py
new file mode 100644
index 0000000..7b53c86
--- /dev/null
+++ b/logger.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Logging module.
+"""
+import logging
+import sys
+from pathlib import Path
+from typing import Optional
+
+from config import config
+
+
+class LoggerManager:
+    """Logger manager."""
+
+    _loggers = {}
+
+    @classmethod
+    def get_logger(cls, name: str, log_file: Optional[str] = None) -> logging.Logger:
+        """Get (or create) a logger."""
+        if name in cls._loggers:
+            return cls._loggers[name]
+
+        logger = logging.getLogger(name)
+        logger.setLevel(getattr(logging, config.log_level.upper()))
+
+        # Avoid adding handlers twice
+        if logger.handlers:
+            return logger
+
+        # Console handler
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setLevel(getattr(logging, config.log_level.upper()))
+        console_formatter = logging.Formatter(config.log_format)
+        console_handler.setFormatter(console_formatter)
+        logger.addHandler(console_handler)
+
+        # File handler
+        if log_file:
+            log_path = Path(config.data_dir) / log_file
+            file_handler = logging.FileHandler(log_path, encoding='utf-8')
+            file_handler.setLevel(getattr(logging, config.log_level.upper()))
+            file_formatter = logging.Formatter(config.log_format)
+            file_handler.setFormatter(file_formatter)
+            logger.addHandler(file_handler)
+
+        # Real-time WebSocket log handler
+        logger.addHandler(WebSocketLogHandler())
+
+        cls._loggers[name] = logger
+        return logger
+
+    @classmethod
+    def setup_root_logger(cls):
+        """Configure the root logger."""
+        logging.basicConfig(
+            level=getattr(logging, config.log_level.upper()),
+            format=config.log_format,
+            handlers=[
+                logging.StreamHandler(sys.stdout),
+                logging.FileHandler(Path(config.data_dir) / "app.log", encoding='utf-8'),
+                WebSocketLogHandler(),
+            ]
+        )
+
+
+# Convenience wrapper
+def get_logger(name: str, log_file: Optional[str] = None) -> logging.Logger:
+    """Convenience wrapper around LoggerManager.get_logger."""
+    return LoggerManager.get_logger(name, log_file)
+
+
+class WebSocketLogHandler(logging.Handler):
+    """Broadcast log records to WebSocket clients via the real-time logger."""
+
+    def emit(self, record: logging.LogRecord) -> None:
+        try:
+            message = self.format(record)
+            level = record.levelname
+            source = record.name
+            # Use the sync entry point, which schedules onto the event loop.
+            # Imported lazily to avoid a circular dependency.
+            from realtime_logger import realtime_logger
+            realtime_logger.broadcast_log_sync(message, level, source)
+        except Exception:
+            # Never let WebSocket delivery failures break logging
+            pass
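+
+
+if __name__ == "__main__":
+    # Illustrative usage (not part of the original design): a named logger
+    # that writes to the console, data/example.log, and the WebSocket broadcaster.
+    log = get_logger("example", "example.log")
+    log.info("logger wired up")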
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f61827a
--- /dev/null
+++ b/main.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+EH-Downloader main application.
+"""
+import glob
+import os
+from pathlib import Path
+from typing import List
+
+from fastapi import FastAPI, Request, HTTPException, WebSocket, WebSocketDisconnect
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from fastapi.responses import JSONResponse, FileResponse
+from pydantic import BaseModel
+import uvicorn
+import asyncio
+import threading
+import json
+
+from config import config
+from logger import get_logger
+from realtime_logger import realtime_logger
+import step2
+from utils import run_step1, run_step2
+
+# Set up logging
+logger = get_logger("main", "app.log")
+
+app = FastAPI(
+    title=config.app_name,
+    version=config.app_version,
+    description="E-Hentai gallery download tool"
+)
+
+@app.on_event("startup")
+async def startup_event():
+    """Application startup hook."""
+    logger.info(f"Starting {config.app_name} v{config.app_version}")
+    # Register the event loop with the real-time logger for cross-thread broadcasts
+    try:
+        realtime_logger.set_loop(asyncio.get_running_loop())
+    except RuntimeError:
+        # Ignore if the running loop cannot be obtained
+        pass
+
+    # Make sure the directories exist
+    config._ensure_directories()
+
+    # Create a default targets.txt
+    if not config.targets_path.exists():
+        with open(config.targets_path, 'w', encoding='utf-8') as f:
+            f.write("# Add target URLs here, one per line\n")
+            f.write("# Example:\n")
+            f.write("https://e-hentai.org/g/3550066/47d6393550\n")
+        logger.info(f"Created file: {config.targets_path}")
+
+    # Create a default proxy.txt
+    if not config.proxy_path.exists():
+        with open(config.proxy_path, 'w', encoding='utf-8') as f:
+            f.write("127.0.0.1:7890\n")
+        logger.info(f"Created file: {config.proxy_path}")
+
+    logger.info("Startup complete")
+
+# Mount static files and templates
+app.mount("/static", StaticFiles(directory="static"), name="static")
+templates = Jinja2Templates(directory="templates")
+
+# WebSocket route
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    """Handle a WebSocket connection."""
+    await websocket.accept()
+    realtime_logger.add_connection(websocket)
+
+    try:
+        # Send the most recent logs
+        recent_logs = await realtime_logger.get_recent_logs(20)
+        for log_entry in recent_logs:
+            await websocket.send_text(json.dumps(log_entry, ensure_ascii=False))
+
+        # Keep the connection open
+        while True:
+            try:
+                # Wait for client messages (heartbeat)
+                data = await websocket.receive_text()
+                if data == "ping":
+                    await websocket.send_text("pong")
+            except WebSocketDisconnect:
+                break
+    except Exception as e:
+        logger.error(f"WebSocket error: {e}")
+    finally:
+        realtime_logger.remove_connection(websocket)
+
+# favicon route
+@app.get("/favicon.ico", include_in_schema=False)
+async def favicon():
+    return FileResponse("static/favicon.ico")
+
+@app.get("/")
+async def home(request: Request):
+    """Main page."""
+    try:
+        proxies = config.get_proxies()
+        return templates.TemplateResponse("index.html", {
+            "request": request,
+            "proxies": proxies,
+            "default_proxy": proxies[0] if proxies else "127.0.0.1:7890"
+        })
+    except Exception as e:
+        logger.error(f"Failed to render the main page: {e}")
+        raise HTTPException(status_code=500, detail="Internal server error")
+
+@app.post("/load_urls")
+async def load_urls():
+    """Read the URLs in targets.txt."""
+    try:
+        urls = config.get_targets()
+
+        if not urls:
+            return JSONResponse({
+                "success": True,
+                "message": "targets.txt is empty; add URLs to data/targets.txt",
+                "urls": []
+            })
+
+        logger.info(f"Read {len(urls)} URLs")
+        return JSONResponse({
+            "success": True,
+            "message": f"Read {len(urls)} URLs",
+            "urls": urls
+        })
+
+    except Exception as e:
+        logger.error(f"Failed to read URLs: {e}")
+        return JSONResponse({
+            "success": False,
+            "message": f"Error reading the file: {str(e)}",
+            "urls": []
+        })
+
+@app.post("/clear")
+async def clear_output():
+    """Clear the output panel."""
+    return JSONResponse({
+        "success": True,
+        "message": "Output cleared",
+        "output": ""
+    })
+
+class ProxyRequest(BaseModel):
+    proxy: str  # a single "ip:port" proxy string
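+    # Example request body (illustrative): {"proxy": "127.0.0.1:7890"}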
"success": True, + "message": "检查未完成文件功能已就绪", + "data": f"共 {len(result)} 个文件未下载" + }) + except Exception as e: + logger.error(f"检查未完成文件失败: {e}") + return JSONResponse({ + "success": False, + "message": f"检查失败: {str(e)}" + }) + +if __name__ == "__main__": + uvicorn.run( + "main:app", + host=config.host, + port=config.port, + reload=config.debug + ) \ No newline at end of file diff --git a/performance.py b/performance.py new file mode 100644 index 0000000..cf7fb34 --- /dev/null +++ b/performance.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +性能优化模块 +""" +import asyncio +import time +from typing import Dict, Any +from functools import wraps + +from logger import get_logger + +logger = get_logger("performance") + + +def monitor_performance(func): + """性能监控装饰器""" + @wraps(func) + async def async_wrapper(*args, **kwargs): + start_time = time.time() + try: + result = await func(*args, **kwargs) + execution_time = time.time() - start_time + logger.info(f"{func.__name__} 执行完成,耗时: {execution_time:.2f}秒") + return result + except Exception as e: + execution_time = time.time() - start_time + logger.error(f"{func.__name__} 执行失败,耗时: {execution_time:.2f}秒,错误: {e}") + raise + + @wraps(func) + def sync_wrapper(*args, **kwargs): + start_time = time.time() + try: + result = func(*args, **kwargs) + execution_time = time.time() - start_time + logger.info(f"{func.__name__} 执行完成,耗时: {execution_time:.2f}秒") + return result + except Exception as e: + execution_time = time.time() - start_time + logger.error(f"{func.__name__} 执行失败,耗时: {execution_time:.2f}秒,错误: {e}") + raise + + if asyncio.iscoroutinefunction(func): + return async_wrapper + else: + return sync_wrapper + + +class PerformanceMonitor: + """性能监控器""" + + def __init__(self): + self.metrics: Dict[str, Any] = {} + self.start_time = time.time() + + def start_timer(self, name: str): + """开始计时""" + self.metrics[name] = {"start": time.time()} + + def end_timer(self, name: str): + """结束计时""" + if name in self.metrics: + self.metrics[name]["end"] = time.time() + self.metrics[name]["duration"] = ( + self.metrics[name]["end"] - self.metrics[name]["start"] + ) + logger.info(f"{name} 耗时: {self.metrics[name]['duration']:.2f}秒") + + def get_summary(self) -> Dict[str, Any]: + """获取性能摘要""" + total_time = time.time() - self.start_time + return { + "total_time": total_time, + "metrics": self.metrics, + "summary": f"总运行时间: {total_time:.2f}秒" + } + + +# 全局性能监控器 +perf_monitor = PerformanceMonitor() diff --git a/realtime_logger.py b/realtime_logger.py new file mode 100644 index 0000000..40f1d1b --- /dev/null +++ b/realtime_logger.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +实时日志输出模块 +""" +import asyncio +import json +import time +import threading +from typing import List, Dict, Any, Optional +from pathlib import Path + +import logging + +logger = logging.getLogger("realtime_logger") + + +class RealtimeLogger: + """实时日志记录器""" + + def __init__(self): + self.connections: List[Any] = [] + self.log_buffer: List[Dict[str, Any]] = [] + self.max_buffer_size = 1000 + self._lock = threading.Lock() + self._loop: Optional[asyncio.AbstractEventLoop] = None + + def set_loop(self, loop: asyncio.AbstractEventLoop) -> None: + """注册主事件循环,便于跨线程安全调度发送任务""" + self._loop = loop + + def add_connection(self, websocket): + """添加WebSocket连接""" + with self._lock: + self.connections.append(websocket) + logger.info(f"新增WebSocket连接,当前连接数: {len(self.connections)}") + + def remove_connection(self, websocket): + """移除WebSocket连接""" + with self._lock: 
diff --git a/realtime_logger.py b/realtime_logger.py
new file mode 100644
index 0000000..40f1d1b
--- /dev/null
+++ b/realtime_logger.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Real-time log broadcasting module.
+"""
+import asyncio
+import json
+import time
+import threading
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+import logging
+
+logger = logging.getLogger("realtime_logger")
+
+
+class RealtimeLogger:
+    """Real-time logger that fans log entries out to WebSocket clients."""
+
+    def __init__(self):
+        self.connections: List[Any] = []
+        self.log_buffer: List[Dict[str, Any]] = []
+        self.max_buffer_size = 1000
+        self._lock = threading.Lock()
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+
+    def set_loop(self, loop: asyncio.AbstractEventLoop) -> None:
+        """Register the main event loop so other threads can schedule sends."""
+        self._loop = loop
+
+    def add_connection(self, websocket):
+        """Register a WebSocket connection."""
+        with self._lock:
+            self.connections.append(websocket)
+            logger.info(f"WebSocket connected; {len(self.connections)} connections")
+
+    def remove_connection(self, websocket):
+        """Remove a WebSocket connection."""
+        with self._lock:
+            if websocket in self.connections:
+                self.connections.remove(websocket)
+                logger.info(f"WebSocket disconnected; {len(self.connections)} connections")
+
+    async def broadcast_log(self, message: str, level: str = "INFO", source: str = "system"):
+        """Broadcast a log entry to every connected client."""
+        log_entry = {
+            "timestamp": time.time(),
+            "time": time.strftime("%H:%M:%S"),
+            "level": level,
+            "source": source,
+            "message": message
+        }
+
+        # Append to the buffer
+        with self._lock:
+            self.log_buffer.append(log_entry)
+            if len(self.log_buffer) > self.max_buffer_size:
+                self.log_buffer = self.log_buffer[-self.max_buffer_size:]
+
+        # Broadcast to all connections
+        if self.connections:
+            message_data = json.dumps(log_entry, ensure_ascii=False)
+            disconnected = []
+
+            for websocket in self.connections.copy():  # copy to avoid concurrent mutation
+                try:
+                    await websocket.send_text(message_data)
+                except Exception as e:
+                    logger.warning(f"Failed to send message: {e}")
+                    disconnected.append(websocket)
+
+            # Drop broken connections
+            for ws in disconnected:
+                self.remove_connection(ws)
+
+    def broadcast_log_sync(self, message: str, level: str = "INFO", source: str = "system"):
+        """Synchronous broadcast entry point (for non-async callers)."""
+        # With a registered loop, schedule the async broadcast thread-safely.
+        # broadcast_log buffers the entry itself, so don't buffer here as
+        # well (doing both would record every entry twice).
+        if self._loop is not None:
+            try:
+                asyncio.run_coroutine_threadsafe(
+                    self.broadcast_log(message=message, level=level, source=source),
+                    self._loop,
+                )
+            except Exception:
+                # Ignore delivery failures; the buffer still serves replays
+                pass
+            return
+
+        # No loop registered yet: just buffer the entry so that new
+        # connections can replay it later.
+        log_entry = {
+            "timestamp": time.time(),
+            "time": time.strftime("%H:%M:%S"),
+            "level": level,
+            "source": source,
+            "message": message
+        }
+        with self._lock:
+            self.log_buffer.append(log_entry)
+            if len(self.log_buffer) > self.max_buffer_size:
+                self.log_buffer = self.log_buffer[-self.max_buffer_size:]
+
+    async def get_recent_logs(self, count: int = 50) -> List[Dict[str, Any]]:
+        """Return the most recent log entries."""
+        with self._lock:
+            return self.log_buffer[-count:] if self.log_buffer else []
+
+    def clear_buffer(self):
+        """Clear the log buffer."""
+        with self._lock:
+            self.log_buffer.clear()
+        logger.info("Log buffer cleared")
+
+
+# Global real-time logger
+realtime_logger = RealtimeLogger()
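+
+
+if __name__ == "__main__":
+    # Illustrative usage (not part of the original design): with no event
+    # loop registered, the sync path only buffers the entry for replay.
+    realtime_logger.broadcast_log_sync("hello from a worker thread", "INFO", "demo")
+    print(realtime_logger.log_buffer[-1]["message"])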
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d5f0cfd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,32 @@
+# Web framework
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+starlette==0.27.0
+websockets==12.0
+
+# HTTP client
+httpx==0.25.2
+httpcore==1.0.9
+
+# Async file I/O
+aiofiles==24.1.0
+
+# HTML parsing
+beautifulsoup4==4.14.0
+lxml==6.0.2
+soupsieve==2.8
+
+# Template engine
+Jinja2==3.1.6
+MarkupSafe==3.0.3
+
+# Data validation
+pydantic==2.11.9
+pydantic_core==2.33.2
+
+# Progress bars
+tqdm==4.67.1
+
+# Misc
+python-multipart==0.0.6
+certifi==2025.8.3
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..6f6f872
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,2 @@
+# src package initialization
+# This file makes Python treat the directory as a package
diff --git a/src/core/__init__.py b/src/core/__init__.py
new file mode 100644
index 0000000..204ed63
--- /dev/null
+++ b/src/core/__init__.py
@@ -0,0 +1,2 @@
+# core package initialization
+# Contains core business logic
diff --git a/src/core/step1.py b/src/core/step1.py
new file mode 100644
index 0000000..2b7c221
--- /dev/null
+++ b/src/core/step1.py
@@ -0,0 +1,83 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import httpx
+from bs4 import BeautifulSoup
+from tqdm.asyncio import tqdm_asyncio
+
+from config import config
+from ..logging.logger import get_logger
+from ..logging.realtime_logger import realtime_logger
+
+log = get_logger("step1")
+
+ILLEGAL_CHARS = re.compile(r'[\\/:*?"<>|]')
+
+def clean_folder_name(name: str) -> str:
+    """Strip characters that are illegal in folder names."""
+    return ILLEGAL_CHARS.sub('', name).strip()
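+# e.g. clean_folder_name('a/b:c?') -> 'abc' (illustrative)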
+
+async def fetch_page(url: str, client: httpx.AsyncClient) -> Optional[str]:
+    """Fetch a page body."""
+    try:
+        resp = await client.get(url)
+        resp.raise_for_status()
+        return resp.text
+    except httpx.HTTPError as e:
+        log.error(f"Failed to fetch {url}: {str(e)}")
+        return None
+
+async def crawl_single_gallery(gallery_url: str, client: httpx.AsyncClient, sem: asyncio.Semaphore):
+    """Crawl a single gallery."""
+    async with sem:
+        try:
+            gallery_url = gallery_url.rstrip()
+            base_url = gallery_url.split('/g/')[0]
+            folder_name = clean_folder_name(gallery_url.split('/')[-1])
+            # Galleries live below the downloads dir (targets_path is a file, not a directory)
+            folder_path = Path(config.downloads_dir) / folder_name
+            json_path = folder_path / "info.json"
+
+            if json_path.exists():
+                log.info(f"Skipping existing gallery: {gallery_url}")
+                return
+
+            os.makedirs(folder_path, exist_ok=True)
+
+            log.info(f"Processing gallery: {gallery_url}")
+            page_content = await fetch_page(gallery_url, client)
+            if not page_content:
+                return
+
+            soup = BeautifulSoup(page_content, 'html.parser')
+            json_data = {
+                "gallery_url": gallery_url,
+                "base_url": base_url,
+                "title": soup.select_one('h1#gn').text.strip(),
+                "images": []
+            }
+
+            # Extract the image info
+            # ... (implementation omitted)
+
+            json_path.write_text(json.dumps(json_data, indent=2, ensure_ascii=False))
+            realtime_logger.broadcast_log_sync(f"Processed gallery: {gallery_url}", "INFO", "step1")
+
+        except Exception as e:
+            log.error(f"Error processing {gallery_url}: {str(e)}")
+            realtime_logger.broadcast_log_sync(f"Error in {gallery_url}: {str(e)}", "ERROR", "step1")
+            raise
+
+async def main(proxy: Optional[str] = None):
+    """Entry point."""
+    # ... (implementation omitted)
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/core/step2.py b/src/core/step2.py
new file mode 100644
index 0000000..635deaf
--- /dev/null
+++ b/src/core/step2.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import aiofiles
+import httpx
+from tqdm.asyncio import tqdm_asyncio
+
+from config import config
+from ..logging.logger import get_logger
+from ..logging.realtime_logger import realtime_logger
+
+log = get_logger("step2")
+
+async def download_one(url: str, client: httpx.AsyncClient, sem: asyncio.Semaphore):
+    """Download a single image."""
+    async with sem:
+        try:
+            resp = await client.get(url)
+            resp.raise_for_status()
+
+            # Extract the real image URL and extension; defines final_path
+            # ... (implementation omitted)
+
+            async with aiofiles.open(final_path, 'wb') as fp:
+                async for chunk in resp.aiter_bytes():
+                    await fp.write(chunk)
+
+            realtime_logger.broadcast_log_sync(f"Downloaded {url}", "INFO", "step2")
+            return True
+        except Exception as e:
+            log.error(f"Failed to download {url}: {str(e)}")
+            realtime_logger.broadcast_log_sync(f"Error downloading {url}: {str(e)}", "ERROR", "step2")
+            return False
+
+async def scan_tasks(root: Path) -> List[Dict[str, str]]:
+    """Scan the download tree for pending image tasks."""
+    tasks = []
+    for json_path in root.rglob("info.json"):
+        try:
+            data = json.loads(json_path.read_text())
+            for url, info in data["images"].items():
+                tasks.append({
+                    "url": info["real_url"],
+                    "save_path": json_path.parent / info["filename"]
+                })
+        except Exception as e:
+            log.warning(f"Error reading {json_path}: {str(e)}")
+    return tasks
+
+async def main(proxy: Optional[str] = None):
+    """Entry point."""
+    # ... (implementation omitted)
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/logging/__init__.py b/src/logging/__init__.py
new file mode 100644
index 0000000..195de12
--- /dev/null
+++ b/src/logging/__init__.py
@@ -0,0 +1,2 @@
+# logging package initialization
+# Contains logging and real-time logging functionality
diff --git a/src/logging/logger.py b/src/logging/logger.py
new file mode 100644
index 0000000..e33bb3c
--- /dev/null
+++ b/src/logging/logger.py
@@ -0,0 +1,59 @@
+import logging
+import sys
+from pathlib import Path
+from typing import Optional
+
+import threading
+from config import config
+# Safe at module level: realtime_logger does not import this module back
+from .realtime_logger import realtime_logger
+
+class LoggerManager:
+    @staticmethod
+    def setup_root_logger(log_file: Optional[str] = None, level: str = "INFO"):
+        logger = logging.getLogger()
+        logger.setLevel(level.upper())
+
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+        logger.addHandler(console_handler)
+
+        if log_file:
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+            logger.addHandler(file_handler)
+
+    @staticmethod
+    def get_logger(name: str, log_file: Optional[str] = None, level: str = "INFO"):
+        logger = logging.getLogger(name)
+        logger.setLevel(level.upper())
+
+        if not logger.handlers:
+            console_handler = logging.StreamHandler(sys.stdout)
+            console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+            logger.addHandler(console_handler)
+
+            if log_file:
+                file_handler = logging.FileHandler(log_file)
+                file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+                logger.addHandler(file_handler)
+
+        return logger
+
+class WebSocketLogHandler(logging.Handler):
+    def __init__(self):
+        super().__init__()
+        self.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+
+    def emit(self, record):
+        try:
+            msg = self.format(record)
+            realtime_logger.broadcast_log_sync(msg, record.levelname, record.name)
+        except Exception:
+            self.handleError(record)
+
+def get_logger(name: str, log_file: Optional[str] = None, level: str = "INFO"):
+    return LoggerManager.get_logger(name, log_file, level)
+
+def setup_root_logger(log_file: Optional[str] = None, level: str = "INFO"):
+    LoggerManager.setup_root_logger(log_file, level)
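+
+# Note: WebSocketLogHandler is defined here but is not attached by default;
+# a caller could wire it up explicitly (illustrative):
+#   logging.getLogger().addHandler(WebSocketLogHandler())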
diff --git a/src/logging/realtime_logger.py b/src/logging/realtime_logger.py
new file mode 100644
index 0000000..1b33f47
--- /dev/null
+++ b/src/logging/realtime_logger.py
@@ -0,0 +1,77 @@
+import asyncio
+import json
+import time
+import threading
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+import logging
+
+from config import config
+
+class RealtimeLogger:
+    _instance = None
+    _lock = threading.Lock()
+
+    def __init__(self):
+        self.logger = logging.getLogger("realtime_logger")
+        self._loop = None
+        self._connections: List[Dict[str, Any]] = []
+        self._log_buffer: List[Dict[str, Any]] = []
+        self._buffer_lock = threading.Lock()
+        self._max_buffer_size = 100
+
+    def set_loop(self, loop):
+        self._loop = loop
+
+    def add_connection(self, websocket):
+        with self._buffer_lock:
+            self._connections.append({"websocket": websocket, "last_active": time.time()})
+
+    def remove_connection(self, websocket):
+        with self._buffer_lock:
+            self._connections = [conn for conn in self._connections if conn["websocket"] != websocket]
+
+    async def broadcast_log(self, message: str, level: str = "INFO", source: str = "system"):
+        log_entry = {
+            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "level": level,
+            "source": source,
+            "message": message
+        }
+
+        with self._buffer_lock:
+            self._log_buffer.append(log_entry)
+            if len(self._log_buffer) > self._max_buffer_size:
+                self._log_buffer.pop(0)
+
+        disconnected = []
+        for connection in self._connections:
+            try:
+                await connection["websocket"].send_text(json.dumps(log_entry))
+                connection["last_active"] = time.time()
+            except Exception as e:
+                self.logger.warning(f"Failed to send log to websocket: {e}")
+                disconnected.append(connection["websocket"])
+
+        for ws in disconnected:
+            self.remove_connection(ws)
+
+    def broadcast_log_sync(self, message: str, level: str = "INFO", source: str = "system"):
+        if self._loop is None:
+            self.logger.warning("Event loop not set for RealtimeLogger")
+            return
+
+        asyncio.run_coroutine_threadsafe(
+            self.broadcast_log(message, level, source),
+            self._loop
+        )
+
+    def get_recent_logs(self, count: int = 10) -> List[Dict[str, Any]]:
+        with self._buffer_lock:
+            return self._log_buffer[-count:].copy()
+
+    def clear_buffer(self):
+        with self._buffer_lock:
+            self._log_buffer.clear()
+
+realtime_logger = RealtimeLogger()
diff --git a/src/services/__init__.py b/src/services/__init__.py
new file mode 100644
index 0000000..2e41ab5
--- /dev/null
+++ b/src/services/__init__.py
@@ -0,0 +1,2 @@
+# services package initialization
+# Contains service classes and components
diff --git a/src/services/downloader.py b/src/services/downloader.py
new file mode 100644
index 0000000..c8a4af3
--- /dev/null
+++ b/src/services/downloader.py
@@ -0,0 +1,43 @@
+import aiofiles
+import httpx
+from typing import Optional
+import os
+
+from config import config
+
+class Downloader:
+    def __init__(self, proxy_str: Optional[str] = None):
+        self.proxy = proxy_str
+
+    async def download(self, url: str, save_path: str):
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
+        async with httpx.AsyncClient(proxies=self.proxy) as client:
+            response = await client.get(url)
+            response.raise_for_status()
+
+            async with aiofiles.open(save_path, 'wb') as f:
+                await f.write(response.content)
+
+    def _get_filename(self, url: str, content_type: Optional[str] = None) -> str:
+        if not url:
+            raise ValueError("URL cannot be empty")
+
+        filename = url.split('/')[-1]
+        if content_type:
+            ext = content_type.split('/')[-1]
+            if '.' not in filename:
+                filename = f"{filename}.{ext}"
+
+        return filename
+
+    async def download_image(self, url: str, save_dir: str):
+        async with httpx.AsyncClient(proxies=self.proxy) as client:
+            response = await client.get(url)
+            response.raise_for_status()
+
+            filename = self._get_filename(url, response.headers.get('content-type'))
+            save_path = os.path.join(save_dir, filename)
+
+            await self.download(url, save_path)
+            return save_path
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..86304b6
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1,2 @@
+# utils package initialization
+# Contains utility functions and helpers
diff --git a/src/utils/performance.py b/src/utils/performance.py
new file mode 100644
index 0000000..9490228
--- /dev/null
+++ b/src/utils/performance.py
@@ -0,0 +1,57 @@
+import asyncio
+import time
+from typing import Dict, Any
+from functools import wraps
+
+from ..logging.logger import get_logger
+
+class PerformanceMonitor:
+    def __init__(self):
+        self.logger = get_logger("performance")
+        self._timers: Dict[str, Dict[str, Any]] = {}
+
+    def monitor_performance(self, func):
+        @wraps(func)
+        async def async_wrapper(*args, **kwargs):
+            start_time = time.time()
+            self.logger.info(f"Starting {func.__name__}")
+            try:
+                result = await func(*args, **kwargs)
+                elapsed = time.time() - start_time
+                self.logger.info(f"Completed {func.__name__} in {elapsed:.2f}s")
+                return result
+            except Exception as e:
+                self.logger.error(f"Error in {func.__name__}: {str(e)}")
+                raise
+
+        @wraps(func)
+        def sync_wrapper(*args, **kwargs):
+            start_time = time.time()
+            self.logger.info(f"Starting {func.__name__}")
+            try:
+                result = func(*args, **kwargs)
+                elapsed = time.time() - start_time
+                self.logger.info(f"Completed {func.__name__} in {elapsed:.2f}s")
+                return result
+            except Exception as e:
+                self.logger.error(f"Error in {func.__name__}: {str(e)}")
+                raise
+
+        if asyncio.iscoroutinefunction(func):
+            return async_wrapper
+        return sync_wrapper
+
+    def start_timer(self, name: str):
+        self._timers[name] = {"start": time.time()}
+
+    def end_timer(self, name: str) -> float:
+        if name not in self._timers:
+            raise KeyError(f"No timer with name {name}")
+
+        elapsed = time.time() - self._timers[name]["start"]
+        self._timers[name]["end"] = time.time()
+        self._timers[name]["elapsed"] = elapsed
+        return elapsed
+
+    def get_summary(self) -> Dict[str, float]:
+        return {name: data["elapsed"] for name, data in self._timers.items() if "elapsed" in data}
diff --git a/src/utils/utils.py b/src/utils/utils.py
new file mode 100644
index 0000000..3d5d444
--- /dev/null
+++ b/src/utils/utils.py
@@ -0,0 +1,25 @@
+from typing import Optional
+
+from ..logging.logger import get_logger
+from ..core.step1 import main as step1_main
+from ..core.step2 import main as step2_main
+
+log = get_logger("utils")
+
+async def run_step1(proxy: Optional[str] = None):
+    try:
+        log.info("Running step1")
+        await step1_main(proxy)
+        log.info("Step1 completed successfully")
+    except Exception as e:
+        log.exception(f"Error in step1: {str(e)}")
+        raise
+
+async def run_step2(proxy: Optional[str] = None):
+    try:
+        log.info("Running step2")
+        await step2_main(proxy)
+        log.info("Step2 completed successfully")
+    except Exception as e:
+        log.exception(f"Error in step2: {str(e)}")
+        raise
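+
+# Illustrative usage (assumes a proxy URL; not part of the original design):
+#   asyncio.run(run_step1("http://127.0.0.1:7890"))
+#   asyncio.run(run_step2("http://127.0.0.1:7890"))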
diff --git a/start.py b/start.py
new file mode 100644
index 0000000..05ee31c
--- /dev/null
+++ b/start.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Application launcher.
+"""
+import sys
+import os
+from pathlib import Path + +# 添加项目根目录到Python路径 +project_root = Path(__file__).parent +sys.path.insert(0, str(project_root)) + +from config import config +from logger import LoggerManager +import uvicorn + +def main(): + """主函数""" + # 设置根日志记录器 + LoggerManager.setup_root_logger() + + # 确保数据目录存在 + config._ensure_directories() + + print(f"启动 {config.app_name} v{config.app_version}") + print(f"服务器地址: http://{config.host}:{config.port}") + print(f"调试模式: {'开启' if config.debug else '关闭'}") + + # 启动服务器 + uvicorn.run( + "main:app", + host=config.host, + port=config.port, + reload=config.debug, + log_level=config.log_level.lower() + ) + +if __name__ == "__main__": + main() diff --git a/static/favicon.ico b/static/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..95375830c6408f7e434d9aeab4180cde76a9e6bd GIT binary patch literal 4286 zcmcgwX>XL*6`p?R_e83yeyC^!X-nynNL^B;shV`9QlTOQLkKudfyR(R2umOoLx3oY z9GbAWtoH1SJu}AR!DeiXiT8a48{^#&gALfsj6M7E^f`AtYLy>QTgUf(?{c1Vo^$Sf z&$UMLHU2*Jlt%t)Y)@!3-_&R{ODXKqyhxFL^~15R{)FN&1YOzgMu z-NM$ucUwqWzAJn_b6zWLx341q7?G^64yJ}iVY@mEk2tm zrUhGsBjH3RHZ15U=b9?oA?uY7w4;`>tft%~H$HMdM*MC*Te-%_wPs4X6`kV0*khuo zXPLJ!S8O#ALu`=vD_%HHPaP5$vBOQwOT@QQ^z0Lx?d%h~?lIRSUS-TJ@lDnVrjoQB ze7B_g*qQ}L$$#fcoCHter{|c`qxzD_ryj9e=BRmUP7)=YKh8Y45>29*_L#*Fj)`uA zpRJ#^@>#?p!u9H%>%>p9nw#Q6o<;O?3q0^yxRE>+pYvsmA{Z8m#6tLJWo}v+ixh_( z55%Dl#$jYGNS>KFM{z-WQnsahka&uI$v-tNy63qr2qpTtxEhx@rQ(->mg73=)Co5I zwUTo;af;choaVTG4#=MbE{+1{r-RVW&a*E&U->d18zJp3ZW3-d}_9`rRnCf({Ic45bW1iW7n_=yE;m6c4``h zVZcrsBxei>z~COix;OIhWFGKBeGtoU+`#kQeOTH*jm5Y9_!;A}mA=2qjyOvEtIf9uRNjF+Fox}HP$N8?4D8oLcMZ8nj2 zosYRf{|)Y5{At};Xm$~&U=$UD9ymrl(DfJMc*l7h>nX&An^v3~a509QFW8K~@T~CL zJt%wgyBIb<4M)LlT)aJ~@Fh+*?<|a-<#?@S8@9KaaB8#xgJ>jYBCf4mOfF=ic?er*YlT$E&@gp|{ z950{^M&_u>E8@yoBcga9zGt8Y!RU)~eARC!E_#p6ygF`rxcO0knc_=8+!qQd=>+~egh18xG-qjf^ zE%_7sONW|Kw)fZQKK~@@i(Z1;`Z6xLx1z-LcU09Jg!@Vs))X(o+M4(9*&zAn8jI{j z$v@g+s2jm68!zDLSAcc2Va@IcGWOob-}d$3P;LXvrM)oRC10|4)45M>jl|bcSMf+E z+=o`9x8NISuULlK>L0@6`2n1lm%v&29XPN59!IL4MdsDlaqLC|9Q4`B^FYp79r0~- zUm_#p3^cz2)@I+wp6qFC*#^9^1IVO3d(JWz&BP}M#`Jk8=O5#zBmbr2-MIY02HZIR zZM5kBjIQ=v^tWw5UrPr1>X)OVH4_I~vhdoKjX2y@2?uk-9wa{fSNI)|usQ1lG`|6s zZ|sJ>s~K833)_P*Gz4H83Mu|2=j6;y=b)U2a{jocM^So|c!nR-j%BzwFv_!?K6-if z%-+M5?i1MFxCI$iSvcI{gj4XDe`evIT$p+S*uU)~X#SGKPyafC{Mr_nn12R-CyhL# zO)}5Ry~0*((91VU_~*Npd&J>=2+RKUxM}(&YES+Gx~>kGd9KLWnom252CVpJ<0iaZ zwF>)dbKxYWBf#8bERC$;%wtL9y!|P@zTS^#vhOlqufX(hj^oi(esCXX8D}l`fI;H_ zKlu56>I%)narjjX7q3FQZ4>SHz?zi(5@O64-%&R{Y@cLi*$WcNRt* zE6`KC83or|u*kVO7lvN^pKQj5b(^tkv>Mq>XJ8foS)W_rK zR{lrSx3lQc1WvTQiz9cMu(RC;8~t~%{*k&MT{jt+$4A>9Ap2Z1w!Gs;#!3%XJl}>_ zpY6o<7yFU><`fPbk0}04D)tNbjE9_ow#NaK_twDdItl%k_tf{t!uZR6wQ)RW>K1l& zI&t>i5G=$mW!)#WX}Xq>`bO-Me7wM%IynU#Y!Bnz>j7*p@p4@d4y1BlaZlURwU1y( zjUf4K<({xmhtwoe0|`f}_ToOt=UFUgpMl?gH}kH7Vq2&sR87UW3ZEA4YgMyKo{-04 z?ggvp=Xk2-ka~%E%e7Los+KlLZ6f+)z4Te=pWGW%9mn~?gQ}a^?_uqEm3EX+q}Eb( zB5hSI*QWE!!u(RW3w}uLCVt7@N!LPB1B>2tohW0{Cwo`bc*2#~!+aC_3W;Aw{0`oC zD!Io+r_7i2B9}NWHNX1Cs+BlhT_aWz-$$AI(L@~ci3Iw|K|6Et8u8Np+Nr;aI)$SK*2!}Jl5G{|&Nl zifyCt!1Ye*Y@)5VXy-8dAJf(bzDxWhpA=VYZN!xvD3 zQ-3Xe?WVrF)aT`$Snj`Jin@byEJlnl@6We5cY{EV~Q^Mmx)$9KVv@cpa*5BPt-{|87jx|09^ literal 0 HcmV?d00001 diff --git a/static/script.js b/static/script.js new file mode 100644 index 0000000..f331548 --- /dev/null +++ b/static/script.js @@ -0,0 +1,300 @@ +class DownloadTool { + constructor() { + this.form = document.getElementById('downloadForm'); + this.output = document.getElementById('output'); + this.loadUrlsBtn = document.getElementById('loadUrls'); + this.urlListTextarea = 
document.getElementById('urlList');
        this.downloadUrlBtn = document.getElementById('downloadUrl');
        this.cleanFilesBtn = document.getElementById('cleanFiles');
        this.downloadImageBtn = document.getElementById('downloadImage');
        this.checkIncompleteBtn = document.getElementById('checkIncomplete');
        this.clearOutputBtn = document.getElementById('clearOutput');
        this.proxySelect = document.getElementById('proxy');

        this.websocket = null;
        this.isConnected = false;

        this.initEvents();
        this.connectWebSocket();
    }

    initEvents() {
        // Load URLs button
        this.loadUrlsBtn.addEventListener('click', () => {
            this.loadTargetUrls();
        });

        // Download URLs button
        this.downloadUrlBtn.addEventListener('click', () => {
            this.downloadUrls();
        });

        // Download images button
        this.downloadImageBtn.addEventListener('click', () => {
            this.downloadImages();
        });

        // Check-incomplete button
        this.checkIncompleteBtn.addEventListener('click', () => {
            this.checkIncomplete();
        });

        // Clean-files button
        this.cleanFilesBtn.addEventListener('click', () => {
            this.cleanFiles();
        });

        // Clear-output button
        this.clearOutputBtn.addEventListener('click', () => {
            this.clearOutput();
        });
    }

    connectWebSocket() {
        try {
            const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
            const wsUrl = `${protocol}//${window.location.host}/ws`;
            this.websocket = new WebSocket(wsUrl);

            this.websocket.onopen = () => {
                this.isConnected = true;
                this.showOutput('WebSocket connected; live logs enabled', 'success');
                console.log('WebSocket connected');
            };

            this.websocket.onmessage = (event) => {
                try {
                    const logEntry = JSON.parse(event.data);
                    this.appendRealtimeLog(logEntry);
                } catch (e) {
                    console.error('Failed to parse WebSocket message:', e);
                }
            };

            this.websocket.onclose = () => {
                this.isConnected = false;
                this.showOutput('WebSocket disconnected; reconnecting...', 'error');
                console.log('WebSocket disconnected');
                // Retry after 5 seconds
                setTimeout(() => this.connectWebSocket(), 5000);
            };

            this.websocket.onerror = (error) => {
                console.error('WebSocket error:', error);
                this.showOutput('WebSocket connection error', 'error');
            };
        } catch (error) {
            console.error('Failed to create the WebSocket connection:', error);
            this.showOutput('WebSocket connection failed', 'error');
        }
    }

    appendRealtimeLog(logEntry) {
        const timestamp = logEntry.time || new Date().toLocaleTimeString();
        const level = logEntry.level || 'INFO';
        const source = logEntry.source || 'system';
        const message = logEntry.message || '';

        const logLine = `[${timestamp}] [${level}] [${source}] ${message}`;

        // Append to the output panel
        if (this.output.textContent) {
            this.output.textContent += '\n' + logLine;
        } else {
            this.output.textContent = logLine;
        }

        // Keep scrolled to the bottom
        this.output.scrollTop = this.output.scrollHeight;

        // Style by log level
        if (level === 'ERROR') {
            this.output.classList.add('error');
        } else if (level === 'SUCCESS') {
            this.output.classList.add('success');
        } else {
            this.output.classList.remove('error', 'success');
        }
    }

    async loadTargetUrls() {
        try {
            this.setLoading(true);
            this.showOutput('Reading targets.txt...', 'info');

            const response = await fetch('/load_urls', {
                method: 'POST'
            });

            const result = await response.json();

            if (result.success) {
                // Show the URLs in the URL list textarea
                this.urlListTextarea.value = result.urls.join('\n');
                this.showOutput(`Read ${result.urls.length} URLs\n\nURL list:\n${result.urls.join('\n')}`, 'success');
            } else {
                this.showOutput(`Read failed: ${result.message}`, 'error');
            }
        } catch (error) {
            this.showOutput(`Error reading URLs: ${error.message}`, 'error');
        } finally {
            this.setLoading(false);
        }
    }

    async clearOutput() {
        try {
            const response = await fetch('/clear', {
                method: 'POST'
            });

            const result = await response.json();
            if (result.success) {
                this.showOutput('', 'success');
                this.urlListTextarea.value = ''; // also clear the URL list
            }
        } catch (error) {
            this.showOutput(`Clear failed: ${error.message}`, 'error');
        }
    }

    async downloadUrls() {
        try {
            const proxy = this.proxySelect.value;

            this.showOutput(`Crawling gallery links...\nProxy: ${proxy}\n\nNote: this can take a while; please be patient...`, 'info');

            // Use setTimeout so the UI is not blocked
            setTimeout(async () => {
                try {
                    const res = await fetch('/download_urls', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({ proxy })
                    });
                    const data = await res.json();
                    this.showOutput(data.message, data.success ? 'success' : 'error');
                } catch (error) {
                    this.showOutput(`Error crawling gallery links: ${error.message}`, 'error');
                }
            }, 100);

        } catch (error) {
            this.showOutput(`Error crawling gallery links: ${error.message}`, 'error');
        }
    }

    async downloadImages() {
        try {
            const proxy = this.proxySelect.value;

            this.showOutput(`Downloading images...\nProxy: ${proxy}\n\nNote: this can take a while; please be patient...`, 'info');

            // Use setTimeout so the UI is not blocked
            setTimeout(async () => {
                try {
                    const res = await fetch('/download_images', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({ proxy })
                    });
                    const data = await res.json();
                    this.showOutput(data.message, data.success ? 'success' : 'error');
                } catch (error) {
                    this.showOutput(`Error downloading images: ${error.message}`, 'error');
                }
            }, 100);

        } catch (error) {
            this.showOutput(`Error downloading images: ${error.message}`, 'error');
        }
    }

    async checkIncomplete() {
        try {
            this.setLoading(true);
            this.showOutput('Checking for incomplete files...', 'info');

            const response = await fetch('/check_incomplete', {
                method: 'POST'
            });

            const result = await response.json();

            if (result.success) {
                let message = `Check finished!\n\n`;
                message += `${result.data}`;
                this.showOutput(message, 'success');
            } else {
                this.showOutput(`Check failed: ${result.message}`, 'error');
            }
        } catch (error) {
            this.showOutput(`Error checking incomplete files: ${error.message}`, 'error');
        } finally {
            this.setLoading(false);
        }
    }

    async cleanFiles() {
        try {
            this.setLoading(true);
            this.showOutput('Cleaning up log and JSON files...', 'info');

            const response = await fetch('/clean_files', {
                method: 'POST'
            });

            const result = await response.json();

            if (result.success) {
                let message = `Cleanup finished! Deleted ${result.deleted_count} files\n\n`;
                if (result.deleted_files && result.deleted_files.length > 0) {
                    message += "Deleted files:\n" + result.deleted_files.join('\n');
                }
                this.showOutput(message, 'success');
            } else {
                let message = `Cleanup finished, but ${result.error_count} files could not be deleted\n\n`;
                if (result.deleted_files && result.deleted_files.length > 0) {
                    message += "Deleted files:\n" + result.deleted_files.join('\n') + '\n\n';
                }
                if (result.error_files && result.error_files.length > 0) {
                    message += "Failed to delete:\n" + result.error_files.join('\n');
                }
                this.showOutput(message, 'error');
            }
        } catch (error) {
            this.showOutput(`Error cleaning files: ${error.message}`, 'error');
        } finally {
            this.setLoading(false);
        }
    }

    showOutput(message, type = '') {
        this.output.textContent = message;
        this.output.className = 'output-area';
        if (type) {
            this.output.classList.add(type);
        }

        // Keep scrolled to the bottom
        this.output.scrollTop = this.output.scrollHeight;
    }

    setLoading(loading) {
        const buttons = this.form.querySelectorAll('button');
        buttons.forEach(button => {
            button.disabled = loading;
        });

        if (loading) {
            document.body.classList.add('loading');
        } else {
            document.body.classList.remove('loading');
        }
    }
}

// Initialize the app
document.addEventListener('DOMContentLoaded', () => {
    new DownloadTool();
});
\ No newline at end of file
diff --git a/static/style.css b/static/style.css
new file mode 100644
index 0000000..0abb935
--- /dev/null
+++ b/static/style.css
@@ -0,0 +1,217 @@
+* {
+    margin: 0;
+    padding: 0;
+    box-sizing: border-box;
+}
+
+body {
+    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    min-height: 100vh;
+    padding: 20px;
+}
+
+.container {
+    max-width: 800px;
+    margin: 0 auto;
+    background: white;
+    border-radius: 10px;
+    padding: 30px;
+    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
+}
+
+h1 {
+    text-align: center;
+    color: #333;
+    margin-bottom: 30px;
+    font-size: 2.5em;
+}
+
+.form-group {
+    margin-bottom: 20px;
+}
+
+label {
+    display: block;
+    margin-bottom: 5px;
+    font-weight: 600;
+    color: #555;
+}
+
+input[type="text"],
+input[type="url"],
+.proxy-select {
+    width: 100%;
+    padding: 12px;
+    border: 2px solid #ddd;
+    border-radius: 5px;
+    font-size: 16px;
+    transition: border-color 0.3s;
+}
+
+input[type="text"]:focus,
+input[type="url"]:focus,
+.proxy-select:focus {
+    outline: none;
+    border-color: #667eea;
+}
+
+/* URL list textarea */
+.url-list {
+    width: 100%;
+    height: 120px;
+    padding: 12px;
+    border: 2px solid #ddd;
+    border-radius: 5px;
+    font-size: 12px;
+    font-family: 'Courier New', monospace;
+    background-color: #f8f9fa;
+    resize: vertical;
+    color: #555;
+}
+
+.url-list:focus {
+    outline: none;
+    border-color: #667eea;
+}
+
+.button-group {
+    display: flex;
+    gap: 10px;
+    margin-bottom: 30px;
+    flex-wrap: wrap;
+}
+
+.btn {
+    padding: 12px 24px;
+    border: none;
+    border-radius: 5px;
+    font-size: 16px;
+    cursor: pointer;
+    transition: all 0.3s;
+    font-weight: 600;
+}
+
+.btn-primary {
+    background: #667eea;
+    color: white;
+}
+
+.btn-primary:hover {
+    background: #5a6fd8;
+    transform: translateY(-2px);
+}
+
+.btn-secondary {
+    background: #764ba2;
+    color: white;
+}
+
+.btn-secondary:hover {
+    background: #6a4190;
+    transform: translateY(-2px);
+}
+
+.btn-danger {
+    background: #e74c3c;
+    color: white;
+}
+
+.btn-danger:hover {
+    background: #c0392b;
+    transform: translateY(-2px);
+}
+
+.btn-info {
+    background: #17a2b8;
+    color: white;
+}
+
+.btn-info:hover {
+    background: #138496;
+    transform: translateY(-2px);
+}
+
+.btn-warning {
+    background: #ffc107;
+    color: #212529;
+}
+
+.btn-warning:hover {
+    background: #e0a800;
+    transform: translateY(-2px);
+}
+
+.btn-success {
+    background: #28a745;
+    color: white;
+}
+
+.btn-success:hover {
+    background: #218838;
+    transform: translateY(-2px);
+}
+
+.output-section h3 {
+    color: #333;
+    margin-bottom: 10px;
+}
+
+.output-area {
+    background: #f8f9fa;
+    border: 1px solid #e9ecef;
+    border-radius: 5px;
+    padding: 15px;
+    min-height: 150px;
+    max-height: 400px;
+    overflow-y: auto;
+    font-family: 'Courier New', monospace;
+    white-space: pre-wrap;
+    word-wrap: break-word;
+}
+
+.output-area.success {
+    border-left: 4px solid #28a745;
+}
+
+.output-area.error {
+    border-left: 4px solid #dc3545;
+}
+
+.loading {
+    opacity: 0.7;
+    pointer-events: none;
+}
+
+@media (max-width: 600px) {
+    .container {
+        padding: 20px;
+    }
+
+    .button-group {
+        flex-direction: column;
+    }
+
+    .btn {
+        width: 100%;
+    }
+}
+
+.form-row {
+    display: flex;
+    gap: 15px;
+}
+
+.form-row .form-group {
+    flex: 1;
+}
+
+/* Select dropdown */
+.proxy-select {
+    background-color: white;
+    cursor: pointer;
+}
+
+.proxy-select option {
+    padding: 8px;
+}
*/
+.proxy-select {
+    background-color: white;
+    cursor: pointer;
+}
+
+.proxy-select option {
+    padding: 8px;
+}
diff --git a/step1.py b/step1.py
new file mode 100644
index 0000000..3fe7300
--- /dev/null
+++ b/step1.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+异步批量抓取 E-H 画廊图片链接,按专辑保存 json
+python step1.py
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import httpx
+from bs4 import BeautifulSoup
+from tqdm.asyncio import tqdm_asyncio
+
+# -------------------- 可配置常量 --------------------
+from config import config
+
+CONCURRENCY = config.concurrency
+MAX_PAGE = config.max_page
+RETRY_PER_PAGE = config.retry_per_page
+TIMEOUT = httpx.Timeout(config.timeout)
+IMG_SELECTOR = "#gdt"  # 图片入口区域
+FAILED_RECORD = "data/failed_keys.json"
+LOG_LEVEL = getattr(logging, config.log_level.upper())
+# ----------------------------------------------------
+
+# 确保数据目录存在
+os.makedirs("data", exist_ok=True)
+
+# 使用统一的日志配置
+from logger import get_logger
+from realtime_logger import realtime_logger
+
+log = get_logger("step1", "crawl.log")
+
+# 预编译正则:匹配文件名中的非法字符
+ILLEGAL_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1F]')
+
+
+# -------------------- 工具函数 --------------------
+def clean_folder_name(title: str) -> str:
+    """清洗文件夹名:非法字符、空格、下划线全部移除,结果为空时回退为 gallery"""
+    return ILLEGAL_CHARS.sub("_", title).replace(" ", "").replace("_", "").strip() or "gallery"
+
+
+def load_targets() -> List[str]:
+    """读取 data/targets.txt,忽略空行和 # 注释行,并去重"""
+    tgt = Path("data/targets.txt")
+    if not tgt.exists():
+        log.error("data/targets.txt 不存在,请先填写 URL")
+        return []
+
+    lines = []
+    for ln in tgt.read_text(encoding="utf-8").splitlines():
+        url = ln.strip()
+        if url and not url.startswith("#"):
+            lines.append(url)
+    if not lines:
+        log.error("targets.txt 为空,请先填写 URL")
+        return []
+    return list(set(lines))  # 去重
+
+
+def load_failed() -> List[str]:
+    if Path(FAILED_RECORD).exists():
+        try:
+            return json.loads(Path(FAILED_RECORD).read_text(encoding="utf-8"))
+        except Exception as exc:
+            log.warning(f"加载失败记录失败 -> {exc}")
+    return []
+
+
+def save_failed(keys: List[str]) -> None:
+    Path(FAILED_RECORD).write_text(json.dumps(keys, ensure_ascii=False, indent=2), encoding="utf-8")
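上面的 `clean_folder_name` 决定了每个画廊在 `data/downloads/` 下的目录名。下面是一个可独立运行的最小演示(正则与上文相同,输入标题为虚构示例),展示非法字符与空格的清洗效果:

```python
import re

# 与 step1.py 相同的非法字符正则
ILLEGAL_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1F]')


def clean_folder_name(title: str) -> str:
    # 非法字符先替换为 "_",再连同空格、下划线一并移除;清空时回退为 "gallery"
    return ILLEGAL_CHARS.sub("_", title).replace(" ", "").replace("_", "").strip() or "gallery"


print(clean_folder_name("[Artist] Some Title / Vol.1?"))  # -> [Artist]SomeTitleVol.1
print(clean_folder_name("???"))                           # -> gallery
```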
+
+
+# -------------------- 爬虫核心 --------------------
+async def fetch_page(client: httpx.AsyncClient, url: str) -> Optional[str]:
+    """获取单页 HTML,失败时按 2^attempt 秒指数退避重试"""
+    for attempt in range(1, RETRY_PER_PAGE + 1):
+        try:
+            resp = await client.get(url)
+            resp.raise_for_status()
+            return resp.text
+        except httpx.HTTPError as exc:
+            log.error(f"[{attempt}/{RETRY_PER_PAGE}] 请求失败 {url} -> {exc}")
+            await asyncio.sleep(2 ** attempt)
+    return None
+
+
+async def crawl_single_gallery(
+    client: httpx.AsyncClient, sem: asyncio.Semaphore, gallery_url: str
+) -> bool:
+    """抓取单个画廊,成功返回 True"""
+    async with sem:
+        base_url = gallery_url.rstrip("/")
+        key = base_url.split("/")[-1]  # 用最后一截当 key
+        json_name = f"{key}.json"
+
+        folder_path: Optional[Path] = None
+        json_data: Dict[str, str] = {}
+        img_count = 1
+        last_page = False
+
+        for page in range(MAX_PAGE):
+            if last_page:
+                break
+            url = f"{base_url}?p={page}"
+            html = await fetch_page(client, url)
+            if html is None:
+                continue
+
+            soup = BeautifulSoup(html, "lxml")
+            title = soup.title.string if soup.title else "gallery"
+            clean_title = clean_folder_name(title)
+            folder_path = Path("data/downloads") / clean_title
+            folder_path.mkdir(parents=True, exist_ok=True)
+
+            # 如果 json 已存在则跳过整个画廊
+            json_path = folder_path / json_name
+            if json_path.exists():
+                log.info(f"{json_name} 已存在,跳过")
+                return True
+
+            log.info(f"当前页码:{page + 1} {url}")
+
+            selected = soup.select_one(IMG_SELECTOR)
+            if not selected:
+                log.warning(f"未找到选择器 {IMG_SELECTOR}")
+                continue
+
+            # 提取缩略图区域内的 /s/ 图片详情页链接(正则按 e-hentai 页面结构假设)
+            links = re.findall(r'href="(https://e-hentai\.org/s/[^"]+)"', str(selected))
+            # 本页没有出现新链接时视为最后一页
+            new_links = [lk for lk in links if lk not in json_data.values()]
+            if not new_links:
+                last_page = True
+                continue
+
+            for link in new_links:
+                json_data[f"{img_count:08d}"] = link  # 零填充序号作为无后缀文件名
+                img_count += 1
+
+        # 保存 json
+        if folder_path and json_data:
+            json_path = folder_path / json_name
+            json_path.write_text(json.dumps(json_data, ensure_ascii=False, indent=2), encoding="utf-8")
+            log.info(f"已保存 -> {json_path} ({len(json_data)} 张)")
+            # 发送实时日志
+            try:
+                realtime_logger.broadcast_log_sync(f"画廊 {key} 抓取完成,共 {len(json_data)} 张图片", "SUCCESS", "step1")
+            except Exception as e:
+                log.warning(f"发送实时日志失败: {e}")
+            return True
+        else:
+            log.warning(f"{key} 未解析到任何图片链接")
+            # 发送实时日志
+            try:
+                realtime_logger.broadcast_log_sync(f"画廊 {key} 未解析到任何图片链接", "WARNING", "step1")
+            except Exception as e:
+                log.warning(f"发送实时日志失败: {e}")
+            return False
+
+
+# -------------------- 主流程 --------------------
+async def main(proxy: str | None = None) -> None:
+    targets = load_targets()
+    if not targets:
+        return
+
+    failed = load_failed()
+    if failed:
+        log.info(f"优先重试上次失败画廊: {len(failed)} 个")
+    all_urls = list(set(targets + failed))
+
+    log.info(f"使用代理: {proxy}")
+    limits = httpx.Limits(max_keepalive_connections=20, max_connections=50)
+    async with httpx.AsyncClient(
+        limits=limits, timeout=TIMEOUT, proxies=proxy, verify=True
+    ) as client:
+        sem = asyncio.Semaphore(CONCURRENCY)
+        results = await tqdm_asyncio.gather(
+            *[crawl_single_gallery(client, sem, u) for u in all_urls],
+            desc="Galleries",
+            total=len(all_urls),
+        )
+
+    # 失败持久化
+    new_failed = [u for u, ok in zip(all_urls, results) if not ok]
+    if new_failed:
+        save_failed(new_failed)
+        log.warning(f"本轮仍有 {len(new_failed)} 个画廊失败,已写入 {FAILED_RECORD}")
+    else:
+        Path(FAILED_RECORD).unlink(missing_ok=True)
+        log.info("全部画廊抓取完成!")
+
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        log.info("用户中断,抓取结束")
\ No newline at end of file
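step1 为每个画廊写出一个 `{key}.json` 清单:键是零填充的无后缀文件名,值是对应的图片详情页 URL,step2 正是按这个映射决定下载什么、存到哪里。下面是一个可独立运行的小草稿,演示清单的结构与读取方式(目录名与 URL 均为虚构示例):

```python
import json
from pathlib import Path

# 虚构示例:data/downloads/SomeGallery/abcdef123456.json
manifest = {
    "00000001": "https://e-hentai.org/s/xxxxxxxxxx/1234567-1",  # 无后缀文件名 -> 详情页 URL
    "00000002": "https://e-hentai.org/s/yyyyyyyyyy/1234567-2",
}

path = Path("data/downloads/SomeGallery/abcdef123456.json")
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")

# step2 的 scan_tasks() 以等价方式遍历清单,跳过已有任意常见后缀的文件
for img_name, img_url in json.loads(path.read_text(encoding="utf-8")).items():
    print(path.parent / img_name, "<-", img_url)
```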
diff --git a/step2.py b/step2.py
new file mode 100644
index 0000000..a6bacd8
--- /dev/null
+++ b/step2.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+异步批量下载 EH 画廊真实图片
+python step2.py
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Dict, List
+
+import aiofiles
+import httpx
+from tqdm.asyncio import tqdm_asyncio
+
+# -------------------- 可配置常量 --------------------
+from config import config
+
+CONCURRENCY = config.concurrency
+RETRY_PER_IMG = config.retry_per_image
+TIMEOUT = httpx.Timeout(config.image_timeout)
+FAILED_RECORD = "data/failed_downloads.json"
+LOG_LEVEL = getattr(logging, config.log_level.upper())
+# ----------------------------------------------------
+
+# 确保数据目录存在
+os.makedirs("data", exist_ok=True)
+
+# 使用统一的日志配置
+from logger import get_logger
+from realtime_logger import realtime_logger
+
+log = get_logger("step2", "download.log")
+
+# 预编译正则:从详情页提取主图地址、从 URL 提取扩展名(均按 e-hentai 页面结构假设)
+IMG_URL_RE = re.compile(r'<img id="img" src="([^"]+)"')
+EXT_RE = re.compile(r'\.([A-Za-z0-9]+)(?:\?.*)?$')
+
+
+def load_failed() -> List[Dict[str, str]]:
+    if Path(FAILED_RECORD).exists():
+        try:
+            return json.loads(Path(FAILED_RECORD).read_text(encoding="utf-8"))
+        except Exception as exc:
+            log.warning(f"加载失败记录失败 -> {exc}")
+    return []
+
+
+def save_failed(failed: List[Dict[str, str]]) -> None:
+    Path(FAILED_RECORD).write_text(json.dumps(failed, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+# -------------------- 下载核心 --------------------
+async def download_one(
+    client: httpx.AsyncClient, sem: asyncio.Semaphore, item: Dict[str, str]
+) -> bool:
+    """下载单张图,成功返回 True"""
+    img_path, img_url = Path(item["img_path"]), item["img_url"]
+
+    async with sem:
+        for attempt in range(1, RETRY_PER_IMG + 1):
+            try:
+                # 1. 获取详情页,解析真实图片地址
+                resp = await client.get(img_url)
+                resp.raise_for_status()
+                real_url_match = IMG_URL_RE.search(resp.text)
+                if not real_url_match:
+                    log.warning(f"未解析到真实图片链接: {img_url}")
+                    return False
+                real_url = real_url_match.group(1)
+
+                # 2. 下载真实图片(流式)
+                ext_match = EXT_RE.search(real_url)
+                ext = ext_match.group(1).lower() if ext_match else "jpg"
+                final_path = img_path.with_suffix(f".{ext}")
+
+                if final_path.exists():
+                    log.info(f"已存在,跳过: {final_path.name}")
+                    return True
+
+                async with client.stream("GET", real_url) as img_resp:
+                    img_resp.raise_for_status()
+                    final_path.parent.mkdir(parents=True, exist_ok=True)
+                    async with aiofiles.open(final_path, "wb") as fp:
+                        async for chunk in img_resp.aiter_bytes(chunk_size=65536):
+                            await fp.write(chunk)
+
+                # 发送实时日志
+                try:
+                    realtime_logger.broadcast_log_sync(f"下载完成: {final_path.name}", "SUCCESS", "step2")
+                except Exception as e:
+                    log.warning(f"发送实时日志失败: {e}")
+                return True
+
+            except httpx.HTTPStatusError as exc:
+                if exc.response.status_code == 429:
+                    wait = 2 ** (attempt - 1)
+                    log.warning(f"[429] 等待 {wait}s 后重试({attempt}/{RETRY_PER_IMG})")
+                    await asyncio.sleep(wait)
+                else:
+                    log.error(f"[HTTP {exc.response.status_code}] {img_url}")
+                    break
+            except Exception as exc:
+                log.error(f"[ERROR] {img_url} -> {exc} ({attempt}/{RETRY_PER_IMG})")
+                await asyncio.sleep(1)
+
+        return False
+
+
+# -------------------- 扫描待下载列表 --------------------
+async def scan_tasks() -> List[Dict[str, str]]:
+    """扫描 data/downloads/ 下所有 json,返回尚未落盘的图片任务"""
+    result = []
+    root = Path("data/downloads")
+    if not root.exists():
+        return result
+
+    for json_path in root.rglob("*.json"):
+        folder = json_path.parent
+        try:
+            data: Dict[str, str] = json.loads(json_path.read_text(encoding="utf-8"))
+        except Exception as exc:
+            log.warning(f"读取 json 失败 {json_path} -> {exc}")
+            continue
+
+        for img_name, img_url in data.items():
+            img_path = folder / img_name  # 无后缀
+            # 任意常见后缀存在即视为已下载
+            exists = any(
+                img_path.with_suffix(ext).exists()
+                for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")
+            )
+            if not exists:
+                result.append({"img_path": str(img_path), "img_url": img_url})
+
+    return result
+
+
+# -------------------- 主流程 --------------------
+async def main(proxy: str | None = None) -> None:
+    # 1. 优先重试上次失败
+    failed_tasks = load_failed()
+    if failed_tasks:
+        log.info(f"优先重试上次失败任务: {len(failed_tasks)} 张")
+
+    tasks = failed_tasks + await scan_tasks()
+    if not tasks:
+        log.info("没有需要下载的图片,收工!")
+        return
+
+    limits = httpx.Limits(max_keepalive_connections=20, max_connections=50)
+    async with httpx.AsyncClient(limits=limits, timeout=TIMEOUT, proxies=proxy, verify=True) as client:
+        sem = asyncio.Semaphore(CONCURRENCY)
+        results = await tqdm_asyncio.gather(
+            *[download_one(client, sem, t) for t in tasks],
+            desc="Downloading",
+            total=len(tasks),
+        )
+
+    # 统计 & 持久化新失败
+    failed_again = [t for t, ok in zip(tasks, results) if not ok]
+    if failed_again:
+        save_failed(failed_again)
+        log.warning(f"本轮仍有 {len(failed_again)} 张下载失败,已写入 {FAILED_RECORD}")
+    else:
+        Path(FAILED_RECORD).unlink(missing_ok=True)
+        log.info("全部下载完成!")
+
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        log.info("用户中断,下载结束")
\ No newline at end of file
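两步脚本都可以脱离 Web 界面单独运行,`main()` 的可选参数就是传给 httpx 的代理地址。下面是一个草稿示例(代理地址为假设值;httpx 要求带 scheme 的完整 URL,且 `proxies=` 参数在 httpx 0.26 起更名为 `proxy=`,以项目实际锁定的版本为准):

```python
import asyncio

from step2 import main as download_images

# proxy.txt 中的 "127.0.0.1:7890" 需先补全 scheme 再传入(假设为 HTTP 代理)
PROXY = "http://127.0.0.1:7890"

asyncio.run(download_images(PROXY))
```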
diff --git a/templates/index.html b/templates/index.html
new file mode 100644
index 0000000..de451ee
--- /dev/null
+++ b/templates/index.html
@@ -0,0 +1,75 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>EH-Downloader</title>
+    <link rel="icon" href="/static/favicon.ico">
+    <link rel="stylesheet" href="/static/style.css">
+</head>
+<body>
+    <div class="container">
+        <h1>EH-Downloader</h1>
+
+        <form id="download-form">
+            <div class="form-group">
+                <label for="proxy-select">代理设置</label>
+                <select id="proxy-select" class="proxy-select"></select>
+            </div>
+
+            <div class="form-group">
+                <label for="url-list">目标URL列表</label>
+                <textarea id="url-list" class="url-list" placeholder="点击下方按钮加载 targets.txt 中的URL"></textarea>
+            </div>
+
+            <div class="button-group">
+                <button type="button" id="load-urls-btn" class="btn btn-primary">读取目标URL</button>
+                <button type="button" id="download-urls-btn" class="btn btn-secondary">下载URL</button>
+                <button type="button" id="download-images-btn" class="btn btn-success">下载图片</button>
+                <button type="button" id="check-incomplete-btn" class="btn btn-info">检查未完成</button>
+                <button type="button" id="clean-files-btn" class="btn btn-warning">清理日志和JSON文件</button>
+                <button type="button" id="clear-btn" class="btn btn-danger">清除输出</button>
+            </div>
+        </form>
+
+        <div class="output-section">
+            <h3>操作日志</h3>
+            <pre id="output" class="output-area"></pre>
+        </div>
+
+        <div class="output-section">
+            <h3>使用说明</h3>
+            <div class="output-area">
+                <strong>工具使用步骤:</strong>
+                <ol>
+                    <li>从下拉框选择代理设置(代理配置保存在项目根目录的proxy.txt中)</li>
+                    <li>将URL复制到项目目录下的data/targets.txt中,一个画廊一个URL(画廊URL可从 <a href="https://e-hentai.org" target="_blank">e-hentai.org</a> 获取)</li>
+                    <li>点击"读取目标URL"加载 targets.txt 中的URL列表</li>
+                    <li>点击"下载URL"开始抓取画廊链接</li>
+                    <li>点击"下载图片"开始下载图片文件</li>
+                    <li>使用"检查未完成"查看下载进度</li>
+                    <li>使用"清理日志和JSON文件"清理临时文件</li>
+                </ol>
+                <strong>注意事项:</strong>
+                <ul>
+                    <li>请确保代理设置正确,且 targets.txt 文件中已添加目标URL</li>
+                    <li>代理配置:在项目根目录的proxy.txt文件中,每行一个代理地址,格式为 IP:端口</li>
+                    <li>下载的图片会保存在 data/downloads 目录下,按画廊名称分文件夹存储</li>
+                    <li>如果下载中断,可以重新运行"下载图片"继续未完成的下载</li>
+                </ul>
+            </div>
+        </div>
+    </div>
+
+    <script src="/static/script.js"></script>
+</body>
+</html>
\ No newline at end of file
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..8cf4ea3
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+工具函数模块
+"""
+from typing import Optional
+
+from logger import get_logger
+from step1 import main as step1_main
+from step2 import main as step2_main
+
+# 设置日志
+logger = get_logger("utils")
+
+
+async def run_step1(proxy: Optional[str] = None) -> str:
+    """执行第一步:抓取画廊链接"""
+    try:
+        logger.info("开始执行画廊链接抓取")
+        await step1_main(proxy)
+        logger.info("画廊链接抓取完成")
+        return "画廊链接抓取完成!"
+    except Exception as e:
+        logger.exception("step1 执行失败")
+        return f"抓取失败:{e}"
+
+
+async def run_step2(proxy: Optional[str] = None) -> str:
+    """执行第二步:下载图片"""
+    try:
+        logger.info("开始执行图片下载")
+        await step2_main(proxy)
+        logger.info("图片下载完成")
+        return "图片下载完成!"
+    except Exception as e:
+        logger.exception("step2 执行失败")
+        return f"下载失败:{e}"
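`run_step1` / `run_step2` 把两步脚本包装成返回一句话结果的协程,真正的应用入口是本 patch 中的 main.py(未在此处展开)。下面是一个示意性草稿,展示它们可以如何挂到 README 所列的 /download_urls、/download_images 路由上,并返回前端 script.js 期望的 `{success, message}` 结构;其中 `normalize_proxy` 是假设的辅助函数,路由细节与真实 main.py 未必一致:

```python
from fastapi import FastAPI
from pydantic import BaseModel

from utils import run_step1, run_step2

app = FastAPI()


class ProxyBody(BaseModel):
    proxy: str = ""  # 前端下拉框传来的 "IP:端口",空串表示直连


def normalize_proxy(raw: str) -> str | None:
    """假设的辅助函数:把 proxy.txt 风格的 "127.0.0.1:7890" 补全为 httpx 可用的 URL"""
    raw = raw.strip()
    if not raw:
        return None
    return raw if raw.startswith(("http://", "https://")) else f"http://{raw}"


@app.post("/download_urls")
async def download_urls(body: ProxyBody):
    # run_step1 内部已捕获异常,失败时返回以"抓取失败"开头的消息
    message = await run_step1(normalize_proxy(body.proxy))
    return {"success": "失败" not in message, "message": message}


@app.post("/download_images")
async def download_images(body: ProxyBody):
    message = await run_step2(normalize_proxy(body.proxy))
    return {"success": "失败" not in message, "message": message}
```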