first commit

commit f557aed15a · branch main · jack, 4 weeks ago
Files changed (32):

+67   .gitignore
+17   Dockerfile
+194  Readme.md
+88   build.sh
+92   config.py
+11   docker-compose.yaml
+62   downloader.py
+88   logger.py
+305  main.py
+81   performance.py
+117  realtime_logger.py
+32   requirements.txt
+2    src/__init__.py
+2    src/core/__init__.py
+83   src/core/step1.py
+63   src/core/step2.py
+2    src/logging/__init__.py
+59   src/logging/logger.py
+77   src/logging/realtime_logger.py
+2    src/services/__init__.py
+43   src/services/downloader.py
+2    src/utils/__init__.py
+57   src/utils/performance.py
+25   src/utils/utils.py
+40   start.py
BIN   static/favicon.ico
+300  static/script.js
+217  static/style.css
+207  step1.py
+187  step2.py
+75   templates/index.html
+35   utils.py

.gitignore (vendored)
@@ -0,0 +1,67 @@
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
.idea/*
xml_files/
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
other/split_clash_config/split_config
ai_news/save_data
daily/*.txt
data/
eh.tar

Dockerfile
@@ -0,0 +1,17 @@
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt \
-i https://pypi.tuna.tsinghua.edu.cn/simple \
--trusted-host pypi.tuna.tsinghua.edu.cn
COPY . .
RUN mkdir -p data/downloads \
&& touch data/targets.txt data/proxy.txt \
&& chmod -R 755 data
EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

Readme.md
@@ -0,0 +1,194 @@
# EH-Downloader

A FastAPI-based E-Hentai gallery download tool that supports asynchronous batch downloading of gallery images.

## Features

- 🚀 **Async downloads**: high-performance asynchronous downloading built on asyncio
- 🌐 **Web UI**: a modern web user interface
- 🔧 **Proxy support**: HTTP proxy configuration
- 📁 **Smart storage**: directories are created automatically, with one folder per gallery
- 🔄 **Resumable**: downloads can continue after an interruption
- 📊 **Progress monitoring**: download progress and status are shown in real time
- 🧹 **Auto cleanup**: one-click removal of temporary files and logs

## Quick Start

### Requirements

- Python 3.11+
- A network proxy (optional, for accessing E-Hentai)

### Install the dependencies

```bash
pip install -r requirements.txt
```

### Run the application

```bash
python main.py
```

Open `http://localhost:8000` to use the web UI.

### Docker deployment

```bash
# Build the image
docker build -t eh-downloader .
# Run the container
docker-compose up -d
```

## Usage

### 1. Configure the proxies

Add proxy entries to `data/proxy.txt`, one per line:

```
127.0.0.1:7890
192.168.1.100:8080
```

### 2. Add target URLs

Add the gallery URLs to download to `data/targets.txt`, one per line:

```
https://e-hentai.org/g/1234567/abcdef123456
https://e-hentai.org/g/2345678/bcdefg234567
```

### 3. Start downloading

1. Open the web UI
2. Choose a proxy
3. Click "Load target URLs" to load the URL list
4. Click "Download URLs" to crawl the gallery links
5. Click "Download images" to start downloading the images

## Project layout

```
ehentai-fastapi/
├── main.py               # main application
├── config.py             # configuration management
├── logger.py             # logging
├── utils.py              # utility functions
├── step1.py              # gallery link crawling
├── step2.py              # image downloading
├── downloader.py         # downloader class
├── templates/            # HTML templates
├── static/               # static assets
├── data/                 # data directory
│   ├── targets.txt       # target URL list
│   ├── proxy.txt         # proxy configuration
│   ├── downloads/        # downloaded files
│   └── *.log             # log files
├── requirements.txt      # dependency list
├── Dockerfile            # Docker configuration
└── docker-compose.yaml   # Docker Compose configuration
```

## Configuration

### Application settings (config.py)

- `concurrency`: concurrency level, default 20
- `max_page`: maximum number of pages per gallery, default 100
- `retry_per_page`: retries per page, default 5
- `retry_per_image`: retries per image, default 3
- `timeout`: request timeout, default 10 seconds
- `image_timeout`: image download timeout, default 15 seconds
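
`AppConfig` in `config.py` is a plain pydantic model, so these settings can also be overridden programmatically when constructing an instance. A minimal sketch (field names come from `config.py`; the override values are illustrative only):

```python
from config import AppConfig

# Override two fields; everything else keeps the defaults from config.py.
# Constructing an AppConfig also creates data/ and data/downloads/.
cfg = AppConfig(concurrency=40, image_timeout=30.0)
print(cfg.concurrency)    # 40
print(cfg.downloads_dir)  # data/downloads
```

Note that the application imports the module-level `config` instance, so changing the defaults in `config.py` is what the running app actually picks up; the snippet above is only for experimentation.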
### Logging

- Log level: INFO
- Log files: `data/app.log`, `data/crawl.log`, `data/download.log`
- Log format: `[time] [level] message`
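
`logger.py` exposes a `get_logger(name, log_file)` helper that attaches the console, file, and WebSocket handlers described above. A minimal sketch (the logger name and message are arbitrary):

```python
from logger import get_logger

# Logs go to stdout, to data/crawl.log, and are broadcast to any
# connected WebSocket clients through realtime_logger.
log = get_logger("step1", "crawl.log")
log.info("crawl started")  # printed as: [timestamp] [INFO] crawl started
```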
## API Endpoints

### GET /
Main page

### POST /load_urls
Read the target URL list

### POST /download_urls
Start crawling gallery links

### POST /download_images
Start downloading images

### POST /check_incomplete
Check for incomplete downloads

### POST /clean_files
Clean up temporary files

### POST /clear
Clear the output
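
All of the POST endpoints can also be driven without the web UI. A minimal sketch using `httpx` (the request body shape follows the `ProxyRequest` model in `main.py`; host and port are the defaults from `config.py`):

```python
import httpx

BASE = "http://localhost:8000"

# The crawl and download endpoints block until the work finishes,
# so the client timeout is disabled here.
with httpx.Client(base_url=BASE, timeout=None) as client:
    print(client.post("/load_urls").json())  # read data/targets.txt
    print(client.post("/download_urls", json={"proxy": "127.0.0.1:7890"}).json())
    print(client.post("/download_images", json={"proxy": "127.0.0.1:7890"}).json())
```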
## Notes

1. **Network**: a stable connection and a suitable proxy are required
2. **Disk space**: make sure there is enough space for the downloaded images
3. **Compliance**: observe applicable laws, regulations, and the site's terms of use
4. **Proxy**: a stable proxy service is recommended for a good success rate

## Troubleshooting

### Common problems

1. **Download failures**: check the proxy configuration and the network connection
2. **Corrupted files**: download again, or check the available disk space
3. **Permission errors**: make sure the application has read/write permissions
4. **Out of memory**: lower the concurrency or add more system memory

### Viewing the logs

```bash
# Application log
tail -f data/app.log
# Crawl log
tail -f data/crawl.log
# Download log
tail -f data/download.log
```

## Development

### Code structure

- **main.py**: the FastAPI application
- **config.py**: configuration management
- **logger.py**: logging management
- **utils.py**: utility functions
- **step1.py**: gallery link crawling
- **step2.py**: image downloading

### Possible extensions

1. Add more download sources
2. Support more image formats
3. Implement download queue management
4. Add user authentication

## License

This project is for learning and research purposes only; observe applicable laws and regulations.

## Changelog

### v1.0.0

- Initial release
- Basic gallery downloading
- Web UI and API endpoints
- Docker support

build.sh
@@ -0,0 +1,88 @@
#!/bin/bash
# Docker image build script
# Usage: ./build.sh [options]

IMAGE_NAME="eh-downloader"
TAG="latest"
DOCKERFILE="Dockerfile"

# Print the help text
show_help() {
    echo "Docker image build script"
    echo ""
    echo "Usage: $0 [options]"
    echo ""
    echo "Options:"
    echo "  -n, --name NAME   image name (default: $IMAGE_NAME)"
    echo "  -t, --tag TAG     image tag (default: $TAG)"
    echo "  -f, --file FILE   Dockerfile path (default: $DOCKERFILE)"
    echo "  --no-cache        build without using the cache"
    echo "  -h, --help        show this help text"
    echo ""
    echo "Examples:"
    echo "  $0                    # build with the defaults"
    echo "  $0 -n myapp -t v1.0   # set the image name and tag"
    echo "  $0 --no-cache         # build without the cache"
}

# Parse the command-line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        -n|--name)
            IMAGE_NAME="$2"
            shift 2
            ;;
        -t|--tag)
            TAG="$2"
            shift 2
            ;;
        -f|--file)
            DOCKERFILE="$2"
            shift 2
            ;;
        --no-cache)
            NO_CACHE="--no-cache"
            shift
            ;;
        -h|--help)
            show_help
            exit 0
            ;;
        *)
            echo "❌ Unknown option: $1"
            show_help
            exit 1
            ;;
    esac
done

# Make sure the Dockerfile exists
if [ ! -f "$DOCKERFILE" ]; then
    echo "❌ Dockerfile not found: $DOCKERFILE"
    exit 1
fi

echo "🚀 Building Docker image..."
echo "📦 Image: $IMAGE_NAME:$TAG"
echo "📄 Dockerfile: $DOCKERFILE"
echo "⏰ Started at: $(date)"

# Run the build ($NO_CACHE is intentionally unquoted: it is either empty or --no-cache)
docker build $NO_CACHE -t "$IMAGE_NAME:$TAG" -f "$DOCKERFILE" .

# Check the result
if [ $? -eq 0 ]; then
    echo ""
    echo "✅ Image built successfully!"
    echo "📊 Image info:"
    docker images | grep "$IMAGE_NAME"
    echo ""
    echo "🎯 Next steps:"
    echo "  1. Start the service with: docker-compose up"
    echo "  2. Or run the container with: docker run -p 8000:8000 -v ./data:/app/data $IMAGE_NAME:$TAG"
else
    echo "❌ Image build failed!"
    exit 1
fi

config.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Configuration management module
"""
from pathlib import Path
from typing import List

from pydantic import BaseModel


class AppConfig(BaseModel):
    """Application configuration"""

    # Basic settings
    app_name: str = "EH-Downloader"
    app_version: str = "1.0.0"
    debug: bool = False

    # Server settings
    host: str = "0.0.0.0"
    port: int = 8000

    # Data directories
    data_dir: str = "data"
    downloads_dir: str = "data/downloads"
    targets_file: str = "data/targets.txt"
    proxy_file: str = "data/proxy.txt"

    # Crawler settings
    concurrency: int = 20
    max_page: int = 100
    retry_per_page: int = 5
    retry_per_image: int = 3
    timeout: float = 10.0
    image_timeout: float = 15.0

    # Logging settings
    log_level: str = "INFO"
    log_format: str = "[%(asctime)s] [%(levelname)s] %(message)s"

    # Cleanup settings
    cleanup_patterns: List[str] = ["**/*.log", "**/*.json"]
    cleanup_exclude: List[str] = ["data/targets.txt"]

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Make sure the directories exist
        self._ensure_directories()

    def _ensure_directories(self):
        """Create the required directories if they are missing"""
        Path(self.data_dir).mkdir(exist_ok=True)
        Path(self.downloads_dir).mkdir(parents=True, exist_ok=True)

    @property
    def targets_path(self) -> Path:
        """Path of the targets file"""
        return Path(self.targets_file)

    @property
    def proxy_path(self) -> Path:
        """Path of the proxy file"""
        return Path(self.proxy_file)

    def get_proxies(self) -> List[str]:
        """Read the proxy list"""
        if not self.proxy_path.exists():
            return ["127.0.0.1:7890"]
        try:
            with open(self.proxy_path, 'r', encoding='utf-8') as f:
                proxies = [line.strip() for line in f if line.strip()]
            return proxies if proxies else ["127.0.0.1:7890"]
        except Exception:
            return ["127.0.0.1:7890"]

    def get_targets(self) -> List[str]:
        """Read the target URL list"""
        if not self.targets_path.exists():
            return []
        try:
            with open(self.targets_path, 'r', encoding='utf-8') as f:
                urls = [line.strip() for line in f if line.strip()]
            # Skip comment lines
            return [url for url in urls if url and not url.startswith('#')]
        except Exception:
            return []


# Global configuration instance
config = AppConfig()

docker-compose.yaml
@@ -0,0 +1,11 @@
services:
  eh-downloader:
    image: eh-downloader:latest
    container_name: eh-downloader
    ports:
      - "58001:8000"
    volumes:
      - ./data:/app/data
    restart: unless-stopped
    environment:
      - PYTHONUNBUFFERED=1

downloader.py
@@ -0,0 +1,62 @@
import os

import aiofiles
import httpx


class Downloader:
    def __init__(self):
        self.output_dir = "downloads"
        os.makedirs(self.output_dir, exist_ok=True)

    async def download(self, proxy_str: str, url: str) -> str:
        """Main download logic"""
        try:
            # Build the proxy URL when proxy_str is non-empty
            proxy = None
            if proxy_str and ":" in proxy_str:
                ip, port = proxy_str.split(":", 1)
                proxy = f"http://{ip}:{port}"
            # Download asynchronously with httpx
            async with httpx.AsyncClient(proxies=proxy, timeout=30.0) as client:
                response = await client.get(url)
                response.raise_for_status()
                # Work out the file name
                filename = self._get_filename(url, response)
                filepath = os.path.join(self.output_dir, filename)
                # Save the file
                async with aiofiles.open(filepath, 'wb') as f:
                    await f.write(response.content)
                return f"Download succeeded: {filename}\nSaved to: {filepath}\nFile size: {len(response.content)} bytes"
        except Exception as e:
            raise Exception(f"Error while downloading: {str(e)}")

    def _get_filename(self, url: str, response: httpx.Response) -> str:
        """Derive a file name from the URL or the response headers"""
        # Take the last path segment of the URL
        if '/' in url:
            filename = url.split('/')[-1]
            if '?' in filename:
                filename = filename.split('?')[0]
        else:
            filename = "downloaded_file"
        # If there is no extension, infer one from the Content-Type
        if '.' not in filename:
            content_type = response.headers.get('content-type', '')
            if 'image' in content_type:
                ext = content_type.split('/')[-1]
                filename = f"{filename}.{ext}"
        return filename or "downloaded_file"

    async def download_image(self, proxy_str: str, url: str) -> str:
        """Image-specific download entry point"""
        # Image-specific handling could be added here
        return await self.download(proxy_str, url)

logger.py
@@ -0,0 +1,88 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Logging management module
"""
import logging
import sys
from pathlib import Path
from typing import Optional

from config import config


class LoggerManager:
    """Logger manager"""

    _loggers = {}

    @classmethod
    def get_logger(cls, name: str, log_file: Optional[str] = None) -> logging.Logger:
        """Return a configured logger"""
        if name in cls._loggers:
            return cls._loggers[name]
        logger = logging.getLogger(name)
        logger.setLevel(getattr(logging, config.log_level.upper()))
        # Avoid adding handlers twice
        if logger.handlers:
            return logger
        # Console handler
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(getattr(logging, config.log_level.upper()))
        console_formatter = logging.Formatter(config.log_format)
        console_handler.setFormatter(console_formatter)
        logger.addHandler(console_handler)
        # File handler
        if log_file:
            log_path = Path(config.data_dir) / log_file
            file_handler = logging.FileHandler(log_path, encoding='utf-8')
            file_handler.setLevel(getattr(logging, config.log_level.upper()))
            file_formatter = logging.Formatter(config.log_format)
            file_handler.setFormatter(file_formatter)
            logger.addHandler(file_handler)
        # WebSocket real-time log handler
        logger.addHandler(WebSocketLogHandler())
        cls._loggers[name] = logger
        return logger

    @classmethod
    def setup_root_logger(cls):
        """Configure the root logger"""
        logging.basicConfig(
            level=getattr(logging, config.log_level.upper()),
            format=config.log_format,
            handlers=[
                logging.StreamHandler(sys.stdout),
                logging.FileHandler(Path(config.data_dir) / "app.log", encoding='utf-8'),
                WebSocketLogHandler(),
            ]
        )


# Convenience function
def get_logger(name: str, log_file: Optional[str] = None) -> logging.Logger:
    """Convenience wrapper around LoggerManager.get_logger"""
    return LoggerManager.get_logger(name, log_file)


class WebSocketLogHandler(logging.Handler):
    """Broadcast log records to WebSocket clients via the real-time logger"""

    def emit(self, record: logging.LogRecord) -> None:
        try:
            message = self.format(record)
            level = record.levelname
            source = record.name
            # Go through the sync API, which schedules onto the event loop.
            # Import lazily to avoid a circular dependency.
            from realtime_logger import realtime_logger
            realtime_logger.broadcast_log_sync(message, level, source)
        except Exception:
            # Never let a WebSocket send failure break logging
            pass

main.py
@@ -0,0 +1,305 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
EH-Downloader main application
"""
import asyncio
import glob
import json
import os

import uvicorn
from fastapi import FastAPI, Request, HTTPException, WebSocket, WebSocketDisconnect
from fastapi.responses import JSONResponse, FileResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel

from config import config
from logger import get_logger
from realtime_logger import realtime_logger
import step2
from utils import run_step1, run_step2

# Set up logging
logger = get_logger("main", "app.log")

app = FastAPI(
    title=config.app_name,
    version=config.app_version,
    description="E-Hentai gallery download tool"
)


@app.on_event("startup")
async def startup_event():
    """Application startup hook"""
    logger.info(f"Starting {config.app_name} v{config.app_version}")
    # Register the event loop with the real-time logger so that log
    # messages can be broadcast from other threads
    try:
        realtime_logger.set_loop(asyncio.get_running_loop())
    except RuntimeError:
        # Ignore if no running loop is available
        pass
    # Make sure the directories exist
    config._ensure_directories()
    # Create a default targets.txt
    if not config.targets_path.exists():
        with open(config.targets_path, 'w', encoding='utf-8') as f:
            f.write("# Add target URLs here, one per line\n")
            f.write("# Example:\n")
            f.write("https://e-hentai.org/g/3550066/47d6393550\n")
        logger.info(f"Created file: {config.targets_path}")
    # Create a default proxy.txt
    if not config.proxy_path.exists():
        with open(config.proxy_path, 'w', encoding='utf-8') as f:
            f.write("127.0.0.1:7890\n")
        logger.info(f"Created file: {config.proxy_path}")
    logger.info("Application startup complete")


# Mount the static files and templates
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")


# WebSocket route
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Handle a WebSocket connection"""
    await websocket.accept()
    realtime_logger.add_connection(websocket)
    try:
        # Send the most recent log entries
        recent_logs = await realtime_logger.get_recent_logs(20)
        for log_entry in recent_logs:
            await websocket.send_text(json.dumps(log_entry, ensure_ascii=False))
        # Keep the connection open
        while True:
            try:
                # Wait for client messages (heartbeat)
                data = await websocket.receive_text()
                if data == "ping":
                    await websocket.send_text("pong")
            except WebSocketDisconnect:
                break
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
    finally:
        realtime_logger.remove_connection(websocket)


# favicon route
@app.get("/favicon.ico", include_in_schema=False)
async def favicon():
    return FileResponse("static/favicon.ico")


@app.get("/")
async def home(request: Request):
    """Main page"""
    try:
        proxies = config.get_proxies()
        return templates.TemplateResponse("index.html", {
            "request": request,
            "proxies": proxies,
            "default_proxy": proxies[0] if proxies else "127.0.0.1:7890"
        })
    except Exception as e:
        logger.error(f"Failed to render the main page: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")


@app.post("/load_urls")
async def load_urls():
    """Read the URLs from targets.txt"""
    try:
        urls = config.get_targets()
        if not urls:
            return JSONResponse({
                "success": True,
                "message": "targets.txt is empty; add URLs to data/targets.txt",
                "urls": []
            })
        logger.info(f"Read {len(urls)} URLs")
        return JSONResponse({
            "success": True,
            "message": f"Read {len(urls)} URLs",
            "urls": urls
        })
    except Exception as e:
        logger.error(f"Failed to read URLs: {e}")
        return JSONResponse({
            "success": False,
            "message": f"Error while reading the file: {str(e)}",
            "urls": []
        })


@app.post("/clear")
async def clear_output():
    """Clear the output"""
    return JSONResponse({
        "success": True,
        "message": "Output cleared",
        "output": ""
    })


class ProxyRequest(BaseModel):
    proxy: str  # a single proxy entry in "ip:port" form


@app.post("/download_urls")
async def download_urls(req: ProxyRequest):
    """Crawl the gallery links"""
    try:
        # Split the proxy string into ip and port
        if ":" in req.proxy:
            ip, port = req.proxy.split(":", 1)
            proxy = f"http://{ip}:{port}"
        else:
            proxy = None
        # Send a real-time log entry
        await realtime_logger.broadcast_log(f"Crawling gallery links, proxy: {proxy}", "INFO", "step1")

        # run_step1 is async, so give it a fresh event loop in a worker
        # thread; awaiting to_thread keeps this handler from blocking
        def run_step1_sync():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                return loop.run_until_complete(run_step1(proxy))
            finally:
                loop.close()

        msg = await asyncio.to_thread(run_step1_sync)
        await realtime_logger.broadcast_log(f"Gallery link crawl finished: {msg}", "SUCCESS", "step1")
        return JSONResponse({"success": True, "message": msg})
    except Exception as e:
        await realtime_logger.broadcast_log(f"Gallery link crawl failed: {e}", "ERROR", "step1")
        logger.error(f"Gallery link crawl failed: {e}")
        return JSONResponse({"success": False, "message": f"Crawl failed: {str(e)}"})


@app.post("/download_images")
async def download_images(req: ProxyRequest):
    """Download the images"""
    try:
        # Split the proxy string into ip and port
        if ":" in req.proxy:
            ip, port = req.proxy.split(":", 1)
            proxy = f"http://{ip}:{port}"
        else:
            proxy = None
        # Send a real-time log entry
        await realtime_logger.broadcast_log(f"Downloading images, proxy: {proxy}", "INFO", "step2")

        # Same pattern as above: run the async step in a worker thread
        def run_step2_sync():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                return loop.run_until_complete(run_step2(proxy))
            finally:
                loop.close()

        msg = await asyncio.to_thread(run_step2_sync)
        await realtime_logger.broadcast_log(f"Image download finished: {msg}", "SUCCESS", "step2")
        return JSONResponse({"success": True, "message": msg})
    except Exception as e:
        await realtime_logger.broadcast_log(f"Image download failed: {e}", "ERROR", "step2")
        logger.error(f"Image download failed: {e}")
        return JSONResponse({"success": False, "message": f"Download failed: {str(e)}"})


@app.post("/clean_files")
async def clean_files():
    """Delete all .log and .json files under the project directory"""
    try:
        deleted_files = []
        error_files = []
        # Use the cleanup patterns from the configuration
        for pattern in config.cleanup_patterns:
            for file_path in glob.glob(pattern, recursive=True):
                try:
                    # Skip excluded files
                    if file_path in config.cleanup_exclude:
                        continue
                    os.remove(file_path)
                    deleted_files.append(file_path)
                    logger.info(f"Deleted file: {file_path}")
                except Exception as e:
                    error_files.append(f"{file_path}: {str(e)}")
                    logger.error(f"Failed to delete {file_path}: {str(e)}")
        if error_files:
            logger.warning(f"Cleanup finished, but {len(error_files)} files could not be deleted")
            return JSONResponse({
                "success": False,
                "message": "Cleanup finished, but some files could not be deleted",
                "deleted_count": len(deleted_files),
                "error_count": len(error_files),
                "deleted_files": deleted_files,
                "error_files": error_files
            })
        else:
            logger.info(f"Cleaned up {len(deleted_files)} files")
            return JSONResponse({
                "success": True,
                "message": f"Cleaned up {len(deleted_files)} files",
                "deleted_count": len(deleted_files),
                "error_count": 0,
                "deleted_files": deleted_files
            })
    except Exception as e:
        logger.error(f"Error during cleanup: {e}")
        return JSONResponse({
            "success": False,
            "message": f"Error during cleanup: {str(e)}",
            "deleted_count": 0,
            "error_count": 0
        })


@app.post("/check_incomplete")
async def check_incomplete():
    """Check for incomplete downloads"""
    try:
        result = await step2.scan_tasks()
        logger.info(f"Incomplete file check: {len(result)}")
        return JSONResponse({
            "success": True,
            "message": "Incomplete file check finished",
            "data": f"{len(result)} files not yet downloaded"
        })
    except Exception as e:
        logger.error(f"Incomplete file check failed: {e}")
        return JSONResponse({
            "success": False,
            "message": f"Check failed: {str(e)}"
        })


if __name__ == "__main__":
    uvicorn.run(
        "main:app",
        host=config.host,
        port=config.port,
        reload=config.debug
    )

performance.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Performance monitoring module
"""
import asyncio
import time
from functools import wraps
from typing import Dict, Any

from logger import get_logger

logger = get_logger("performance")


def monitor_performance(func):
    """Decorator that logs the execution time of a function"""
    @wraps(func)
    async def async_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            result = await func(*args, **kwargs)
            execution_time = time.time() - start_time
            logger.info(f"{func.__name__} finished in {execution_time:.2f}s")
            return result
        except Exception as e:
            execution_time = time.time() - start_time
            logger.error(f"{func.__name__} failed after {execution_time:.2f}s, error: {e}")
            raise

    @wraps(func)
    def sync_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            result = func(*args, **kwargs)
            execution_time = time.time() - start_time
            logger.info(f"{func.__name__} finished in {execution_time:.2f}s")
            return result
        except Exception as e:
            execution_time = time.time() - start_time
            logger.error(f"{func.__name__} failed after {execution_time:.2f}s, error: {e}")
            raise

    if asyncio.iscoroutinefunction(func):
        return async_wrapper
    else:
        return sync_wrapper


class PerformanceMonitor:
    """Performance monitor"""

    def __init__(self):
        self.metrics: Dict[str, Any] = {}
        self.start_time = time.time()

    def start_timer(self, name: str):
        """Start a named timer"""
        self.metrics[name] = {"start": time.time()}

    def end_timer(self, name: str):
        """Stop a named timer"""
        if name in self.metrics:
            self.metrics[name]["end"] = time.time()
            self.metrics[name]["duration"] = (
                self.metrics[name]["end"] - self.metrics[name]["start"]
            )
            logger.info(f"{name} took {self.metrics[name]['duration']:.2f}s")

    def get_summary(self) -> Dict[str, Any]:
        """Return a performance summary"""
        total_time = time.time() - self.start_time
        return {
            "total_time": total_time,
            "metrics": self.metrics,
            "summary": f"Total runtime: {total_time:.2f}s"
        }


# Global performance monitor
perf_monitor = PerformanceMonitor()

realtime_logger.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Real-time log output module
"""
import asyncio
import json
import logging
import threading
import time
from typing import List, Dict, Any, Optional

logger = logging.getLogger("realtime_logger")


class RealtimeLogger:
    """Real-time log broadcaster"""

    def __init__(self):
        self.connections: List[Any] = []
        self.log_buffer: List[Dict[str, Any]] = []
        self.max_buffer_size = 1000
        self._lock = threading.Lock()
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    def set_loop(self, loop: asyncio.AbstractEventLoop) -> None:
        """Register the main event loop so broadcasts can be scheduled safely across threads"""
        self._loop = loop

    def add_connection(self, websocket):
        """Register a WebSocket connection"""
        with self._lock:
            self.connections.append(websocket)
        logger.info(f"WebSocket connection added, current count: {len(self.connections)}")

    def remove_connection(self, websocket):
        """Remove a WebSocket connection"""
        with self._lock:
            if websocket in self.connections:
                self.connections.remove(websocket)
        logger.info(f"WebSocket connection removed, current count: {len(self.connections)}")

    async def broadcast_log(self, message: str, level: str = "INFO", source: str = "system"):
        """Broadcast a log message to every connected client"""
        log_entry = {
            "timestamp": time.time(),
            "time": time.strftime("%H:%M:%S"),
            "level": level,
            "source": source,
            "message": message
        }
        # Append to the buffer
        with self._lock:
            self.log_buffer.append(log_entry)
            if len(self.log_buffer) > self.max_buffer_size:
                self.log_buffer = self.log_buffer[-self.max_buffer_size:]
        # Broadcast to all connections
        if self.connections:
            message_data = json.dumps(log_entry, ensure_ascii=False)
            disconnected = []
            for websocket in self.connections.copy():  # copy to avoid concurrent mutation
                try:
                    await websocket.send_text(message_data)
                except Exception as e:
                    logger.warning(f"Failed to send message: {e}")
                    disconnected.append(websocket)
            # Drop the broken connections
            for ws in disconnected:
                self.remove_connection(ws)

    def broadcast_log_sync(self, message: str, level: str = "INFO", source: str = "system"):
        """Synchronous broadcast entry point (for non-async contexts)"""
        # If an event loop is registered, schedule the async broadcast
        # thread-safely; broadcast_log buffers the entry itself, so it
        # is not buffered a second time here
        if self._loop is not None:
            try:
                asyncio.run_coroutine_threadsafe(
                    self.broadcast_log(message=message, level=level, source=source),
                    self._loop,
                )
                return
            except Exception:
                # Fall back to plain buffering if scheduling fails
                pass
        # No loop available: just buffer the entry so that new
        # connections can replay it later
        log_entry = {
            "timestamp": time.time(),
            "time": time.strftime("%H:%M:%S"),
            "level": level,
            "source": source,
            "message": message
        }
        with self._lock:
            self.log_buffer.append(log_entry)
            if len(self.log_buffer) > self.max_buffer_size:
                self.log_buffer = self.log_buffer[-self.max_buffer_size:]

    async def get_recent_logs(self, count: int = 50) -> List[Dict[str, Any]]:
        """Return the most recent log entries"""
        with self._lock:
            return self.log_buffer[-count:] if self.log_buffer else []

    def clear_buffer(self):
        """Clear the log buffer"""
        with self._lock:
            self.log_buffer.clear()
        logger.info("Log buffer cleared")


# Global real-time logger
realtime_logger = RealtimeLogger()

requirements.txt
@@ -0,0 +1,32 @@
# Web framework
fastapi==0.104.1
uvicorn[standard]==0.24.0
starlette==0.27.0
websockets==12.0
# HTTP client
httpx==0.25.2
httpcore==1.0.9
# Async file I/O
aiofiles==24.1.0
# HTML parsing
beautifulsoup4==4.14.0
lxml==6.0.2
soupsieve==2.8
# Template engine
Jinja2==3.1.6
MarkupSafe==3.0.3
# Data validation
pydantic==2.11.9
pydantic_core==2.33.2
# Progress bars
tqdm==4.67.1
# Miscellaneous
python-multipart==0.0.6
certifi==2025.8.3

src/__init__.py
@@ -0,0 +1,2 @@
# src package initialization
# This file makes Python treat the directory as a package

src/core/__init__.py
@@ -0,0 +1,2 @@
# core package initialization
# Contains core business logic

src/core/step1.py
@@ -0,0 +1,83 @@
from __future__ import annotations

import asyncio
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

import httpx
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm_asyncio

from config import config
from ..logging.logger import get_logger
from ..logging.realtime_logger import realtime_logger

log = get_logger("step1")

ILLEGAL_CHARS = re.compile(r'[\\/:*?"<>|]')


def clean_folder_name(name: str) -> str:
    """Strip characters that are illegal in folder names"""
    return ILLEGAL_CHARS.sub('', name).strip()


async def fetch_page(url: str, client: httpx.AsyncClient) -> Optional[str]:
    """Fetch the page content"""
    try:
        resp = await client.get(url)
        resp.raise_for_status()
        return resp.text
    except httpx.HTTPError as e:
        log.error(f"Failed to fetch {url}: {str(e)}")
        return None


async def crawl_single_gallery(gallery_url: str, client: httpx.AsyncClient, sem: asyncio.Semaphore):
    """Crawl a single gallery"""
    async with sem:
        try:
            gallery_url = gallery_url.rstrip()
            base_url = gallery_url.split('/g/')[0]
            folder_name = clean_folder_name(gallery_url.split('/')[-1])
            # Store each gallery under the downloads directory
            folder_path = Path(config.downloads_dir) / folder_name
            json_path = folder_path / "info.json"
            if json_path.exists():
                log.info(f"Skipping existing gallery: {gallery_url}")
                return
            os.makedirs(folder_path, exist_ok=True)
            log.info(f"Processing gallery: {gallery_url}")
            page_content = await fetch_page(gallery_url, client)
            if not page_content:
                return
            soup = BeautifulSoup(page_content, 'html.parser')
            json_data = {
                "gallery_url": gallery_url,
                "base_url": base_url,
                "title": soup.select_one('h1#gn').text.strip(),
                "images": []
            }
            # Extract the image information
            # ... (implementation omitted)
            json_path.write_text(json.dumps(json_data, indent=2, ensure_ascii=False))
            realtime_logger.broadcast_log_sync(f"Processed gallery: {gallery_url}", "INFO", "step1")
        except Exception as e:
            log.error(f"Error processing {gallery_url}: {str(e)}")
            realtime_logger.broadcast_log_sync(f"Error in {gallery_url}: {str(e)}", "ERROR", "step1")
            raise


async def main(proxy: Optional[str] = None):
    """Entry point"""
    # ... (implementation omitted)
    ...


if __name__ == "__main__":
    asyncio.run(main())

src/core/step2.py
@@ -0,0 +1,63 @@
from __future__ import annotations

import asyncio
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

import aiofiles
import httpx
from tqdm.asyncio import tqdm_asyncio

from config import config
from ..logging.logger import get_logger
from ..logging.realtime_logger import realtime_logger

log = get_logger("step2")


async def download_one(url: str, client: httpx.AsyncClient, sem: asyncio.Semaphore):
    """Download a single image"""
    async with sem:
        try:
            resp = await client.get(url)
            resp.raise_for_status()
            # Extract the real image URL and extension, and derive final_path
            # ... (implementation omitted)
            async with aiofiles.open(final_path, 'wb') as fp:
                async for chunk in resp.aiter_bytes():
                    await fp.write(chunk)
            realtime_logger.broadcast_log_sync(f"Downloaded {url}", "INFO", "step2")
            return True
        except Exception as e:
            log.error(f"Failed to download {url}: {str(e)}")
            realtime_logger.broadcast_log_sync(f"Error downloading {url}: {str(e)}", "ERROR", "step2")
            return False


async def scan_tasks(root: Path) -> List[Dict[str, str]]:
    """Scan the task directories"""
    tasks = []
    for json_path in root.rglob("info.json"):
        try:
            data = json.loads(json_path.read_text())
            for url, info in data["images"].items():
                tasks.append({
                    "url": info["real_url"],
                    "save_path": json_path.parent / info["filename"]
                })
        except Exception as e:
            log.warning(f"Error reading {json_path}: {str(e)}")
    return tasks


async def main(proxy: Optional[str] = None):
    """Entry point"""
    # ... (implementation omitted)
    ...


if __name__ == "__main__":
    asyncio.run(main())

src/logging/__init__.py
@@ -0,0 +1,2 @@
# logging package initialization
# Contains logging and real-time logging functionality

src/logging/logger.py
@@ -0,0 +1,59 @@
import logging
import sys
from typing import Optional

# realtime_logger does not import this module back, so a
# module-level import is safe here (no circular dependency)
from .realtime_logger import realtime_logger


class LoggerManager:
    @staticmethod
    def setup_root_logger(log_file: Optional[str] = None, level: str = "INFO"):
        logger = logging.getLogger()
        logger.setLevel(level.upper())
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(console_handler)
        if log_file:
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
            logger.addHandler(file_handler)

    @staticmethod
    def get_logger(name: str, log_file: Optional[str] = None, level: str = "INFO"):
        logger = logging.getLogger(name)
        logger.setLevel(level.upper())
        if not logger.handlers:
            console_handler = logging.StreamHandler(sys.stdout)
            console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
            logger.addHandler(console_handler)
            if log_file:
                file_handler = logging.FileHandler(log_file)
                file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
                logger.addHandler(file_handler)
        return logger


class WebSocketLogHandler(logging.Handler):
    def __init__(self):
        super().__init__()
        self.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

    def emit(self, record):
        try:
            msg = self.format(record)
            realtime_logger.broadcast_log_sync(msg, record.levelname, record.name)
        except Exception:
            self.handleError(record)


def get_logger(name: str, log_file: Optional[str] = None, level: str = "INFO"):
    return LoggerManager.get_logger(name, log_file, level)


def setup_root_logger(log_file: Optional[str] = None, level: str = "INFO"):
    LoggerManager.setup_root_logger(log_file, level)

src/logging/realtime_logger.py
@@ -0,0 +1,77 @@
import asyncio
import json
import time
import threading
from typing import List, Dict, Any, Optional
from pathlib import Path
import logging

from config import config


class RealtimeLogger:
    _instance = None
    _lock = threading.Lock()

    def __init__(self):
        self.logger = logging.getLogger("realtime_logger")
        self._loop = None
        self._connections: List[Dict[str, Any]] = []
        self._log_buffer: List[Dict[str, Any]] = []
        self._buffer_lock = threading.Lock()
        self._max_buffer_size = 100

    def set_loop(self, loop):
        self._loop = loop

    def add_connection(self, websocket):
        with self._buffer_lock:
            self._connections.append({"websocket": websocket, "last_active": time.time()})

    def remove_connection(self, websocket):
        with self._buffer_lock:
            self._connections = [conn for conn in self._connections if conn["websocket"] != websocket]

    async def broadcast_log(self, message: str, level: str = "INFO", source: str = "system"):
        log_entry = {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "level": level,
            "source": source,
            "message": message
        }
        with self._buffer_lock:
            self._log_buffer.append(log_entry)
            if len(self._log_buffer) > self._max_buffer_size:
                self._log_buffer.pop(0)
        disconnected = []
        for connection in self._connections:
            try:
                await connection["websocket"].send_text(json.dumps(log_entry))
                connection["last_active"] = time.time()
            except Exception as e:
                self.logger.warning(f"Failed to send log to websocket: {e}")
                disconnected.append(connection["websocket"])
        for ws in disconnected:
            self.remove_connection(ws)

    def broadcast_log_sync(self, message: str, level: str = "INFO", source: str = "system"):
        if self._loop is None:
            self.logger.warning("Event loop not set for RealtimeLogger")
            return
        asyncio.run_coroutine_threadsafe(
            self.broadcast_log(message, level, source),
            self._loop
        )

    def get_recent_logs(self, count: int = 10) -> List[Dict[str, Any]]:
        with self._buffer_lock:
            return self._log_buffer[-count:].copy()

    def clear_buffer(self):
        with self._buffer_lock:
            self._log_buffer.clear()


realtime_logger = RealtimeLogger()

src/services/__init__.py
@@ -0,0 +1,2 @@
# services package initialization
# Contains service classes and components

src/services/downloader.py
@@ -0,0 +1,43 @@
import aiofiles
import httpx
from typing import Optional
import os

from config import config


class Downloader:
    def __init__(self, proxy_str: Optional[str] = None):
        self.proxy = proxy_str

    async def download(self, url: str, save_path: str):
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        async with httpx.AsyncClient(proxies=self.proxy) as client:
            response = await client.get(url)
            response.raise_for_status()
            async with aiofiles.open(save_path, 'wb') as f:
                await f.write(response.content)

    def _get_filename(self, url: str, content_type: Optional[str] = None) -> str:
        if not url:
            raise Exception("URL cannot be empty")
        filename = url.split('/')[-1]
        if content_type:
            ext = content_type.split('/')[-1]
            if '.' not in filename:
                filename = f"{filename}.{ext}"
        return filename

    async def download_image(self, url: str, save_dir: str):
        async with httpx.AsyncClient(proxies=self.proxy) as client:
            response = await client.get(url)
            response.raise_for_status()
            filename = self._get_filename(url, response.headers.get('content-type'))
            save_path = os.path.join(save_dir, filename)
            await self.download(url, save_path)
            return save_path

src/utils/__init__.py
@@ -0,0 +1,2 @@
# utils package initialization
# Contains utility functions and helpers

src/utils/performance.py
@@ -0,0 +1,57 @@
import asyncio
import time
from typing import Dict, Any
from functools import wraps

from ..logging.logger import get_logger


class PerformanceMonitor:
    def __init__(self):
        self.logger = get_logger("performance")
        self._timers: Dict[str, Dict[str, Any]] = {}

    def monitor_performance(self, func):
        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            start_time = time.time()
            self.logger.info(f"Starting {func.__name__}")
            try:
                result = await func(*args, **kwargs)
                elapsed = time.time() - start_time
                self.logger.info(f"Completed {func.__name__} in {elapsed:.2f}s")
                return result
            except Exception as e:
                self.logger.error(f"Error in {func.__name__}: {str(e)}")
                raise

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            start_time = time.time()
            self.logger.info(f"Starting {func.__name__}")
            try:
                result = func(*args, **kwargs)
                elapsed = time.time() - start_time
                self.logger.info(f"Completed {func.__name__} in {elapsed:.2f}s")
                return result
            except Exception as e:
                self.logger.error(f"Error in {func.__name__}: {str(e)}")
                raise

        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        return sync_wrapper

    def start_timer(self, name: str):
        self._timers[name] = {"start": time.time()}

    def end_timer(self, name: str) -> float:
        if name not in self._timers:
            raise KeyError(f"No timer with name {name}")
        elapsed = time.time() - self._timers[name]["start"]
        self._timers[name]["end"] = time.time()
        self._timers[name]["elapsed"] = elapsed
        return elapsed

    def get_summary(self) -> Dict[str, float]:
        return {name: data["elapsed"] for name, data in self._timers.items() if "elapsed" in data}

src/utils/utils.py
@@ -0,0 +1,25 @@
from typing import Optional

from ..logging.logger import get_logger
from ..core.step1 import main as step1_main
from ..core.step2 import main as step2_main

log = get_logger("utils")


async def run_step1(proxy: Optional[str] = None):
    try:
        log.info("Running step1")
        await step1_main(proxy)
        log.info("Step1 completed successfully")
    except Exception as e:
        log.exception(f"Error in step1: {str(e)}")
        raise


async def run_step2(proxy: Optional[str] = None):
    try:
        log.info("Running step2")
        await step2_main(proxy)
        log.info("Step2 completed successfully")
    except Exception as e:
        log.exception(f"Error in step2: {str(e)}")
        raise

start.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Application startup script
"""
import sys
from pathlib import Path

# Add the project root to the Python path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

import uvicorn

from config import config
from logger import LoggerManager


def main():
    """Entry point"""
    # Configure the root logger
    LoggerManager.setup_root_logger()
    # Make sure the data directories exist
    config._ensure_directories()
    print(f"Starting {config.app_name} v{config.app_version}")
    print(f"Server address: http://{config.host}:{config.port}")
    print(f"Debug mode: {'on' if config.debug else 'off'}")
    # Start the server
    uvicorn.run(
        "main:app",
        host=config.host,
        port=config.port,
        reload=config.debug,
        log_level=config.log_level.lower()
    )


if __name__ == "__main__":
    main()

static/favicon.ico
Binary file not shown (4.2 KiB)

static/script.js
@@ -0,0 +1,300 @@
class DownloadTool {
    constructor() {
        this.form = document.getElementById('downloadForm');
        this.output = document.getElementById('output');
        this.loadUrlsBtn = document.getElementById('loadUrls');
        this.urlListTextarea = document.getElementById('urlList');
        this.downloadUrlBtn = document.getElementById('downloadUrl');
        this.cleanFilesBtn = document.getElementById('cleanFiles');
        this.downloadImageBtn = document.getElementById('downloadImage');
        this.checkIncompleteBtn = document.getElementById('checkIncomplete');
        this.clearOutputBtn = document.getElementById('clearOutput');
        this.proxySelect = document.getElementById('proxy');
        this.websocket = null;
        this.isConnected = false;
        this.initEvents();
        this.connectWebSocket();
    }

    initEvents() {
        // "Load URLs" button
        this.loadUrlsBtn.addEventListener('click', () => {
            this.loadTargetUrls();
        });
        // "Download URLs" button
        this.downloadUrlBtn.addEventListener('click', () => {
            this.downloadUrls();
        });
        // "Download images" button
        this.downloadImageBtn.addEventListener('click', () => {
            this.downloadImages();
        });
        // "Check incomplete" button
        this.checkIncompleteBtn.addEventListener('click', () => {
            this.checkIncomplete();
        });
        // "Clean files" button
        this.cleanFilesBtn.addEventListener('click', () => {
            this.cleanFiles();
        });
        // "Clear output" button
        this.clearOutputBtn.addEventListener('click', () => {
            this.clearOutput();
        });
    }

    connectWebSocket() {
        try {
            const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
            const wsUrl = `${protocol}//${window.location.host}/ws`;
            this.websocket = new WebSocket(wsUrl);
            this.websocket.onopen = () => {
                this.isConnected = true;
                this.showOutput('WebSocket connected; real-time logs are available', 'success');
                console.log('WebSocket connected');
            };
            this.websocket.onmessage = (event) => {
                try {
                    const logEntry = JSON.parse(event.data);
                    this.appendRealtimeLog(logEntry);
                } catch (e) {
                    console.error('Failed to parse WebSocket message:', e);
                }
            };
            this.websocket.onclose = () => {
                this.isConnected = false;
                this.showOutput('WebSocket disconnected; trying to reconnect...', 'error');
                console.log('WebSocket disconnected');
                // Try to reconnect after 5 seconds
                setTimeout(() => this.connectWebSocket(), 5000);
            };
            this.websocket.onerror = (error) => {
                console.error('WebSocket error:', error);
                this.showOutput('WebSocket connection error', 'error');
            };
        } catch (error) {
            console.error('Failed to create the WebSocket connection:', error);
            this.showOutput('WebSocket connection failed', 'error');
        }
    }

    appendRealtimeLog(logEntry) {
        const timestamp = logEntry.time || new Date().toLocaleTimeString();
        const level = logEntry.level || 'INFO';
        const source = logEntry.source || 'system';
        const message = logEntry.message || '';
        const logLine = `[${timestamp}] [${level}] [${source}] ${message}`;
        // Append to the output box
        if (this.output.textContent) {
            this.output.textContent += '\n' + logLine;
        } else {
            this.output.textContent = logLine;
        }
        // Auto-scroll to the bottom
        this.output.scrollTop = this.output.scrollHeight;
        // Style the output according to the log level
        if (level === 'ERROR') {
            this.output.classList.add('error');
        } else if (level === 'SUCCESS') {
            this.output.classList.add('success');
        } else {
            this.output.classList.remove('error', 'success');
        }
    }

    async loadTargetUrls() {
        try {
            this.setLoading(true);
            this.showOutput('Reading targets.txt...', 'info');
            const response = await fetch('/load_urls', {
                method: 'POST'
            });
            const result = await response.json();
            if (result.success) {
                // Show the URLs in the URL list textarea
                this.urlListTextarea.value = result.urls.join('\n');
                this.showOutput(`Read ${result.urls.length} URLs\n\nURL list:\n${result.urls.join('\n')}`, 'success');
            } else {
                this.showOutput(`Read failed: ${result.message}`, 'error');
            }
        } catch (error) {
            this.showOutput(`Error while reading URLs: ${error.message}`, 'error');
        } finally {
            this.setLoading(false);
        }
    }

    async clearOutput() {
        try {
            const response = await fetch('/clear', {
                method: 'POST'
            });
            const result = await response.json();
            if (result.success) {
                this.showOutput('', 'success');
                this.urlListTextarea.value = ''; // also clear the URL list
            }
        } catch (error) {
            this.showOutput(`Clear failed: ${error.message}`, 'error');
        }
    }

    async downloadUrls() {
        try {
            const proxy = this.proxySelect.value;
            this.showOutput(`Crawling gallery links...\nProxy: ${proxy}\n\nNote: this can take a while, please be patient...`, 'info');
            // Use setTimeout so the UI is not blocked
            setTimeout(async () => {
                try {
                    const res = await fetch('/download_urls', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({ proxy })
                    });
                    const data = await res.json();
                    this.showOutput(data.message, data.success ? 'success' : 'error');
                } catch (error) {
                    this.showOutput(`Error while crawling gallery links: ${error.message}`, 'error');
                }
            }, 100);
        } catch (error) {
            this.showOutput(`Error while crawling gallery links: ${error.message}`, 'error');
        }
    }

    async downloadImages() {
        try {
            const proxy = this.proxySelect.value;
            this.showOutput(`Downloading images...\nProxy: ${proxy}\n\nNote: this can take a while, please be patient...`, 'info');
            // Use setTimeout so the UI is not blocked
            setTimeout(async () => {
                try {
                    const res = await fetch('/download_images', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({ proxy })
                    });
                    const data = await res.json();
                    this.showOutput(data.message, data.success ? 'success' : 'error');
                } catch (error) {
                    this.showOutput(`Error while downloading images: ${error.message}`, 'error');
                }
            }, 100);
        } catch (error) {
            this.showOutput(`Error while downloading images: ${error.message}`, 'error');
        }
    }

    async checkIncomplete() {
        try {
            this.setLoading(true);
            this.showOutput('Checking for incomplete files...', 'info');
            const response = await fetch('/check_incomplete', {
                method: 'POST'
            });
            const result = await response.json();
            if (result.success) {
                let message = `Check finished!\n\n`;
                message += `${result.data}`;
                this.showOutput(message, 'success');
            } else {
                this.showOutput(`Check failed: ${result.message}`, 'error');
            }
        } catch (error) {
            this.showOutput(`Error while checking incomplete files: ${error.message}`, 'error');
        } finally {
            this.setLoading(false);
        }
    }

    async cleanFiles() {
        try {
            this.setLoading(true);
            this.showOutput('Cleaning up log and JSON files...', 'info');
            const response = await fetch('/clean_files', {
                method: 'POST'
            });
            const result = await response.json();
            if (result.success) {
                let message = `Cleanup finished! Deleted ${result.deleted_count} files\n\n`;
                if (result.deleted_files && result.deleted_files.length > 0) {
                    message += "Deleted files:\n" + result.deleted_files.join('\n');
                }
                this.showOutput(message, 'success');
            } else {
                let message = `Cleanup finished, but ${result.error_count} files could not be deleted\n\n`;
                if (result.deleted_files && result.deleted_files.length > 0) {
                    message += "Deleted files:\n" + result.deleted_files.join('\n') + '\n\n';
                }
                if (result.error_files && result.error_files.length > 0) {
                    message += "Files that could not be deleted:\n" + result.error_files.join('\n');
                }
                this.showOutput(message, 'error');
            }
        } catch (error) {
            this.showOutput(`Error while cleaning files: ${error.message}`, 'error');
        } finally {
            this.setLoading(false);
        }
    }

    showOutput(message, type = '') {
        this.output.textContent = message;
        this.output.className = 'output-area';
        if (type) {
            this.output.classList.add(type);
        }
        // Auto-scroll to the bottom
        this.output.scrollTop = this.output.scrollHeight;
    }

    setLoading(loading) {
        const buttons = this.form.querySelectorAll('button');
        buttons.forEach(button => {
            button.disabled = loading;
        });
        if (loading) {
            document.body.classList.add('loading');
        } else {
            document.body.classList.remove('loading');
        }
    }
}

// Initialize the application
document.addEventListener('DOMContentLoaded', () => {
    new DownloadTool();
});

static/style.css
@@ -0,0 +1,217 @@
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 100vh;
padding: 20px;
}
.container {
max-width: 800px;
margin: 0 auto;
background: white;
border-radius: 10px;
padding: 30px;
box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
}
h1 {
text-align: center;
color: #333;
margin-bottom: 30px;
font-size: 2.5em;
}
.form-group {
margin-bottom: 20px;
}
label {
display: block;
margin-bottom: 5px;
font-weight: 600;
color: #555;
}
input[type="text"],
input[type="url"],
.proxy-select {
width: 100%;
padding: 12px;
border: 2px solid #ddd;
border-radius: 5px;
font-size: 16px;
transition: border-color 0.3s;
}
input[type="text"]:focus,
input[type="url"]:focus,
.proxy-select:focus {
outline: none;
border-color: #667eea;
}
/* URL list textarea */
.url-list {
width: 100%;
height: 120px;
padding: 12px;
border: 2px solid #ddd;
border-radius: 5px;
font-size: 12px;
font-family: 'Courier New', monospace;
background-color: #f8f9fa;
resize: vertical;
color: #555;
}
.url-list:focus {
outline: none;
border-color: #667eea;
}
.button-group {
display: flex;
gap: 10px;
margin-bottom: 30px;
flex-wrap: wrap;
}
.btn {
padding: 12px 24px;
border: none;
border-radius: 5px;
font-size: 16px;
cursor: pointer;
transition: all 0.3s;
font-weight: 600;
}
.btn-primary {
background: #667eea;
color: white;
}
.btn-primary:hover {
background: #5a6fd8;
transform: translateY(-2px);
}
.btn-secondary {
background: #764ba2;
color: white;
}
.btn-secondary:hover {
background: #6a4190;
transform: translateY(-2px);
}
.btn-danger {
background: #e74c3c;
color: white;
}
.btn-danger:hover {
background: #c0392b;
transform: translateY(-2px);
}
.btn-info {
background: #17a2b8;
color: white;
}
.btn-info:hover {
background: #138496;
transform: translateY(-2px);
}
.btn-warning {
background: #ffc107;
color: #212529;
}
.btn-warning:hover {
background: #e0a800;
transform: translateY(-2px);
}
.btn-success {
background: #28a745;
color: white;
}
.btn-success:hover {
background: #218838;
transform: translateY(-2px);
}
.output-section h3 {
color: #333;
margin-bottom: 10px;
}
.output-area {
background: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 5px;
padding: 15px;
min-height: 150px;
max-height: 400px;
overflow-y: auto;
font-family: 'Courier New', monospace;
white-space: pre-wrap;
word-wrap: break-word;
}
.output-area.success {
border-left: 4px solid #28a745;
}
.output-area.error {
border-left: 4px solid #dc3545;
}
.loading {
opacity: 0.7;
pointer-events: none;
}
@media (max-width: 600px) {
.container {
padding: 20px;
}
.button-group {
flex-direction: column;
}
.btn {
width: 100%;
}
}
.form-row {
display: flex;
gap: 15px;
}
.form-row .form-group {
flex: 1;
}
/* Dropdown styling */
.proxy-select {
background-color: white;
cursor: pointer;
}
.proxy-select option {
padding: 8px;
}

step1.py
@@ -0,0 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Asynchronously batch-crawl E-H gallery image links and save one JSON per gallery.
python eh_crawler.py
"""
from __future__ import annotations

import asyncio
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

import httpx
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm_asyncio

# -------------------- Configurable constants --------------------
from config import config

CONCURRENCY = config.concurrency
MAX_PAGE = config.max_page
RETRY_PER_PAGE = config.retry_per_page
TIMEOUT = httpx.Timeout(config.timeout)
IMG_SELECTOR = "#gdt"  # image entry area
FAILED_RECORD = "data/failed_keys.json"
LOG_LEVEL = getattr(logging, config.log_level.upper())
# ----------------------------------------------------------------

# Make sure the data directory exists
if not os.path.exists("data"):
    os.mkdir("data")

# Use the shared logging configuration
from logger import get_logger
from realtime_logger import realtime_logger

log = get_logger("step1", "crawl.log")

# Pre-compiled regex
ILLEGAL_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1F]')


# -------------------- Helpers --------------------
def clean_folder_name(title: str) -> str:
    """Sanitize a folder name"""
    return ILLEGAL_CHARS.sub("_", title).replace(" ", "").replace("_", "").strip() or "gallery"


def load_targets() -> List[str]:
    """Read targets.txt"""
    tgt = Path("data/targets.txt")
    lines = []
    for ln in tgt.read_text(encoding="utf-8").splitlines():
        url = ln.strip()
        if url and not url.startswith('#'):
            lines.append(url)
    if not lines:
        log.error("targets.txt is empty; add URLs first")
        return []
    return list(set(lines))  # de-duplicate


def load_failed() -> List[str]:
    if Path(FAILED_RECORD).exists():
        try:
            return json.loads(Path(FAILED_RECORD).read_text(encoding="utf-8"))
        except Exception as exc:
            log.warning(f"Failed to load the failure record -> {exc}")
    return []


def save_failed(keys: List[str]) -> None:
    Path(FAILED_RECORD).write_text(json.dumps(keys, ensure_ascii=False, indent=2), encoding="utf-8")


# -------------------- Crawler core --------------------
async def fetch_page(client: httpx.AsyncClient, url: str) -> Optional[str]:
    """Fetch a single HTML page"""
    for attempt in range(1, RETRY_PER_PAGE + 1):
        try:
            resp = await client.get(url)
            resp.raise_for_status()
            return resp.text
        except httpx.HTTPError as exc:
            log.error(f"[{attempt}/{RETRY_PER_PAGE}] request failed {url} -> {exc}")
            await asyncio.sleep(2 ** attempt)
    return None


async def crawl_single_gallery(
    client: httpx.AsyncClient, sem: asyncio.Semaphore, gallery_url: str
) -> bool:
    """Crawl one gallery; return True on success"""
    async with sem:
        base_url = gallery_url.rstrip("/")
        key = base_url.split("/")[-1]  # use the last path segment as the key
        json_name = f"{key}.json"
        folder_path: Optional[Path] = None
        json_data: Dict[str, str] = {}
        img_count = 1
        last_page = False
        for page in range(MAX_PAGE):
            if last_page:
                break
            url = f"{base_url}?p={page}"
            html = await fetch_page(client, url)
            if html is None:
                continue
            soup = BeautifulSoup(html, "lxml")
            title = soup.title.string if soup.title else "gallery"
            clean_title = clean_folder_name(title)
            folder_path = Path("data/downloads") / clean_title
            folder_path.mkdir(parents=True, exist_ok=True)
            # If the JSON already exists, skip the whole gallery
            json_path = folder_path / json_name
            if json_path.exists():
                log.info(f"{json_name} already exists, skipping")
                return True
            log.info(f"Current page: {page + 1} {url}")
            selected = soup.select_one(IMG_SELECTOR)
            if not selected:
                log.warning(f"Selector {IMG_SELECTOR} not found")
                continue
            links = re.findall(r'<a href="(.*?)"', selected.prettify())
            if not links:
                log.info("No image entries on this page; treating it as the last page")
                last_page = True
                continue
            for img_entry in links:
                if img_entry in json_data.values():
                    last_page = True
                    break
                json_data[f"{img_count:04d}"] = img_entry
                img_count += 1
        if json_data:
            json_path.write_text(
                json.dumps(json_data, ensure_ascii=False, indent=2), encoding="utf-8"
            )
            log.info(f"Saved -> {json_path} ({len(json_data)} images)")
            # Send a real-time log entry
            try:
                realtime_logger.broadcast_log_sync(f"Gallery {key} crawled, {len(json_data)} images", "SUCCESS", "step1")
            except Exception as e:
                log.warning(f"Failed to send real-time log: {e}")
            return True
        else:
            log.warning(f"{key} yielded no image links")
            # Send a real-time log entry
            try:
                realtime_logger.broadcast_log_sync(f"Gallery {key} yielded no image links", "WARNING", "step1")
            except Exception as e:
                log.warning(f"Failed to send real-time log: {e}")
            return False


# -------------------- Main flow --------------------
async def main(proxy: str | None = None) -> None:
    targets = load_targets()
    failed = load_failed()
    if failed:
        log.info(f"Retrying previously failed galleries first: {len(failed)}")
    all_urls = list(set(targets + failed))
    log.info(f"Using proxy: {proxy}")
    limits = httpx.Limits(max_keepalive_connections=20, max_connections=50)
    async with httpx.AsyncClient(
        limits=limits, timeout=TIMEOUT, proxies=proxy, verify=True
    ) as client:
        sem = asyncio.Semaphore(CONCURRENCY)
        results = await tqdm_asyncio.gather(
            *[crawl_single_gallery(client, sem, u) for u in all_urls],
            desc="Galleries",
            total=len(all_urls),
        )
    # Persist the failures
    new_failed = [u for u, ok in zip(all_urls, results) if not ok]
    if new_failed:
        save_failed(new_failed)
        log.warning(f"{len(new_failed)} galleries still failed this round; written to {FAILED_RECORD}")
    else:
        Path(FAILED_RECORD).unlink(missing_ok=True)
        log.info("All galleries crawled!")


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        log.info("Interrupted by user; crawl stopped")

step2.py
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Asynchronously batch-download the real EH gallery images.
python download_images.py
"""
from __future__ import annotations

import asyncio
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Dict, List

import aiofiles
import httpx
from tqdm.asyncio import tqdm_asyncio

# -------------------- Configurable constants --------------------
from config import config

CONCURRENCY = config.concurrency
RETRY_PER_IMG = config.retry_per_image
TIMEOUT = httpx.Timeout(config.image_timeout)
FAILED_RECORD = "data/failed_downloads.json"
LOG_LEVEL = getattr(logging, config.log_level.upper())
# ----------------------------------------------------------------

# Make sure the data directory exists
if not os.path.exists("data"):
    os.mkdir("data")

# Use the shared logging configuration
from logger import get_logger
from realtime_logger import realtime_logger

log = get_logger("step2", "download.log")

# Pre-compiled regexes
IMG_URL_RE = re.compile(r'<img id="img" src="(.*?)"', re.S)
EXT_RE = re.compile(r"\.(jpg|jpeg|png|gif|webp)$", re.I)


# -------------------- Helpers --------------------
def load_failed() -> List[Dict[str, str]]:
    if Path(FAILED_RECORD).exists():
        try:
            return json.loads(Path(FAILED_RECORD).read_text(encoding="utf-8"))
        except Exception as exc:
            log.warning(f"Failed to load the failure record -> {exc}")
    return []


def save_failed(failed: List[Dict[str, str]]) -> None:
    Path(FAILED_RECORD).write_text(json.dumps(failed, ensure_ascii=False, indent=2), encoding="utf-8")


# -------------------- Download core --------------------
async def download_one(
    client: httpx.AsyncClient, sem: asyncio.Semaphore, item: Dict[str, str]
) -> bool:
    """Download a single image; return True on success"""
    img_path, img_url = Path(item["img_path"]), item["img_url"]
    await sem.acquire()
    try:
        for attempt in range(1, RETRY_PER_IMG + 1):
            try:
                # 1. Fetch the detail page
                resp = await client.get(img_url)
                resp.raise_for_status()
                real_url_match = IMG_URL_RE.search(resp.text)
                if not real_url_match:
                    log.warning(f"No real image link found: {img_url}")
                    return False  # <- no await is triggered here
                real_url = real_url_match.group(1)
                # 2. Stream the real image to disk
                ext_match = EXT_RE.search(real_url)
                ext = ext_match.group(1).lower() if ext_match else "jpg"
                final_path = img_path.with_suffix(f".{ext}")
                if final_path.exists():
                    log.info(f"Already exists, skipping: {final_path.name}")
                    return True
                async with client.stream("GET", real_url) as img_resp:
                    img_resp.raise_for_status()
                    final_path.parent.mkdir(parents=True, exist_ok=True)
                    async with aiofiles.open(final_path, "wb") as fp:
                        async for chunk in img_resp.aiter_bytes(chunk_size=65536):
                            await fp.write(chunk)
                # log.info(f"[OK] {final_path.name}")
                # Send a real-time log entry
                try:
                    realtime_logger.broadcast_log_sync(f"Downloaded: {final_path.name}", "SUCCESS", "step2")
                except Exception as e:
                    log.warning(f"Failed to send real-time log: {e}")
                return True
            except httpx.HTTPStatusError as exc:
                if exc.response.status_code == 429:
                    wait = 2 ** (attempt - 1)
                    log.warning(f"[429] waiting {wait}s before retrying ({attempt}/{RETRY_PER_IMG})")
                    await asyncio.sleep(wait)
                else:
                    log.error(f"[HTTP {exc.response.status_code}] {img_url}")
                    break
            except Exception as exc:
                log.error(f"[ERROR] {img_url} -> {exc} ({attempt}/{RETRY_PER_IMG})")
                await asyncio.sleep(1)
        return False
    finally:
        sem.release()


# -------------------- Scan the pending downloads --------------------
async def scan_tasks() -> List[Dict[str, str]]:
    """Scan every JSON under downloads/ and return the pending download list"""
    result = []
    root = Path("data/downloads")
    if not root.exists():
        return result
    for json_path in root.rglob("*.json"):
        folder = json_path.parent
        try:
            data: Dict[str, str] = json.loads(json_path.read_text(encoding="utf-8"))
        except Exception as exc:
            log.warning(f"Failed to read JSON {json_path} -> {exc}")
            continue
        for img_name, img_url in data.items():
            img_path = folder / img_name  # no suffix yet
            # Check whether the file already exists with any known extension
            exists = False
            for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp"):
                if img_path.with_suffix(ext).exists():
                    exists = True
                    break
            if not exists:
                result.append({"img_path": str(img_path), "img_url": img_url})
    return result


# -------------------- Main flow --------------------
async def main(proxy: str | None = None) -> None:
    # 1. Retry previous failures first
    failed_tasks = load_failed()
    if failed_tasks:
        log.info(f"Retrying previously failed tasks first: {len(failed_tasks)}")
    tasks = failed_tasks + await scan_tasks()
    if not tasks:
        log.info("Nothing to download; all done!")
        return
    limits = httpx.Limits(max_keepalive_connections=20, max_connections=50)
    async with httpx.AsyncClient(limits=limits, timeout=TIMEOUT, proxies=proxy, verify=True) as client:
        sem = asyncio.Semaphore(CONCURRENCY)
        results = await tqdm_asyncio.gather(
            *[download_one(client, sem, t) for t in tasks],
            desc="Downloading",
            total=len(tasks),
        )
    # Tally and persist the new failures
    failed_again = [t for t, ok in zip(tasks, results) if not ok]
    if failed_again:
        save_failed(failed_again)
        log.warning(f"{len(failed_again)} images still failed this round; written to {FAILED_RECORD}")
    else:
        Path(FAILED_RECORD).unlink(missing_ok=True)
        log.info("All downloads complete!")


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        log.info("Interrupted by user; download stopped")

templates/index.html
@@ -0,0 +1,75 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>EH-Downloader</title>
    <link rel="stylesheet" href="/static/style.css">
</head>
<body>
    <div class="container">
        <h1>EH</h1>
        <form id="downloadForm" class="form">
            <div class="form-group">
                <label for="proxy">Proxy:</label>
                <select id="proxy" name="proxy" class="proxy-select" required>
                    {% for proxy in proxies %}
                    <option value="{{ proxy }}" {% if proxy == default_proxy %}selected{% endif %}>{{ proxy }}</option>
                    {% endfor %}
                </select>
            </div>
            <div class="form-group">
                <button type="button" id="loadUrls" class="btn btn-info">Load target URLs</button>
            </div>
            <div class="form-group">
                <label for="urlList">Target URL list:</label>
                <textarea id="urlList" name="urlList" class="url-list" readonly placeholder="The URLs read from targets.txt will be shown here"></textarea>
            </div>
            <div class="button-group">
                <button type="button" id="downloadUrl" class="btn btn-primary">Download URLs</button>
                <button type="button" id="downloadImage" class="btn btn-secondary">Download images</button>
                <button type="button" id="checkIncomplete" class="btn btn-info">Check incomplete</button>
                <button type="button" id="cleanFiles" class="btn btn-warning">Clean logs and JSON files</button>
                <button type="button" id="clearOutput" class="btn btn-danger">Clear output</button>
            </div>
        </form>
        <div class="output-section">
            <h3>Activity log</h3>
            <pre id="output" class="output-area"></pre>
        </div>
        <br><br>
        <div class="output-section">
            <h3>Instructions</h3>
            <div class="usage-instructions">
                <p><strong>How to use this tool:</strong></p>
                <ol>
                    <li>Pick a proxy from the dropdown (proxies are configured in data/proxy.txt)</li>
                    <li>Copy the URLs into data/targets.txt in the project directory, one URL per gallery</li>
                    <li>Get the gallery URLs to download from <a href="https://e-hentai.org/" target="_blank">here</a></li>
                    <li>Click "Load target URLs" to load the URL list from targets.txt</li>
                    <li>Click "Download URLs" to start crawling the gallery links</li>
                    <li>Click "Download images" to start downloading the image files</li>
                    <li>Use "Check incomplete" to see the download progress</li>
                    <li>Use "Clean logs and JSON files" to remove temporary files</li>
                </ol>
                <p><strong>Notes:</strong></p>
                <ul>
                    <li>Make sure the proxy is configured correctly and that targets.txt contains target URLs</li>
                    <li>Proxy configuration: one proxy per line in data/proxy.txt, in IP:port format</li>
                    <li>Downloaded images are stored under data/downloads, one folder per gallery</li>
                    <li>If a download is interrupted, run "Download images" again to resume</li>
                </ul>
            </div>
        </div>
    </div>
    <script src="/static/script.js"></script>
</body>
</html>

utils.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Utility functions module
"""
from typing import Optional

from logger import get_logger
from step1 import main as step1_main
from step2 import main as step2_main

# Set up logging
logger = get_logger("utils")


async def run_step1(proxy: Optional[str] = None) -> str:
    """Step 1: crawl the gallery links"""
    try:
        logger.info("Starting the gallery link crawl")
        await step1_main(proxy)
        logger.info("Gallery link crawl finished")
        return "Gallery link crawl finished!"
    except Exception as e:
        logger.exception("step1 failed")
        return f"Crawl failed: {e}"


async def run_step2(proxy: Optional[str] = None) -> str:
    """Step 2: download the images"""
    try:
        logger.info("Starting the image download")
        await step2_main(proxy)
        logger.info("Image download finished")
        return "Image download finished!"
    except Exception as e:
        logger.exception("step2 failed")
        return f"Download failed: {e}"