From 68c1c143a4d47efa4c072f8deb029353d32f897f Mon Sep 17 00:00:00 2001 From: Jack Date: Sun, 23 Nov 2025 19:56:44 +0800 Subject: [PATCH] update --- main.py | 575 ++++++++---------------------------------------- post_eh_data.js | 25 +-- 2 files changed, 107 insertions(+), 493 deletions(-) diff --git a/main.py b/main.py index 1eb6087..eda065b 100644 --- a/main.py +++ b/main.py @@ -26,8 +26,8 @@ logger = logging.getLogger(__name__) DOWNLOADS_DIR = "downloads" MAX_FILENAME_LENGTH = 100 INVALID_FILENAME_CHARS = '<>:"/\\|?*' -MAX_CONCURRENT_DOWNLOADS = 5 # 最大并发下载数 -DOWNLOAD_TIMEOUT = 30 # 下载超时时间(秒) +MAX_CONCURRENT_DOWNLOADS = 5 +DOWNLOAD_TIMEOUT = 30 # FastAPI应用 app = FastAPI(title="eh-v2") @@ -57,39 +57,29 @@ class DownloadStatusResponse(BaseModel): # 工具函数 def setup_downloads_directory() -> Path: - """创建并返回下载目录路径""" downloads_path = Path(DOWNLOADS_DIR) downloads_path.mkdir(exist_ok=True) - logger.info(f"下载目录已准备: {downloads_path.absolute()}") return downloads_path def sanitize_filename(filename: str) -> str: - """清理文件名,移除非法字符并限制长度""" sanitized = filename for char in INVALID_FILENAME_CHARS: sanitized = sanitized.replace(char, '_') - - # 限制文件名长度 if len(sanitized) > MAX_FILENAME_LENGTH: sanitized = sanitized[:MAX_FILENAME_LENGTH] - return sanitized def create_title_directory(base_path: Path, title: str) -> Path: - """创建标题对应的目录""" safe_title = sanitize_filename(title) title_dir = base_path / safe_title title_dir.mkdir(exist_ok=True) - logger.info(f"创建标题目录: {title_dir}") return title_dir async def save_data_to_file(file_path: Path, data: Dict[str, Any]) -> None: - """异步保存数据到JSON文件""" async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: await f.write(json.dumps(data, ensure_ascii=False, indent=2)) def get_all_galleries() -> List[GalleryInfo]: - """获取所有画廊信息""" galleries = [] downloads_path = Path(DOWNLOADS_DIR) @@ -104,7 +94,6 @@ def get_all_galleries() -> List[GalleryInfo]: with open(data_file, 'r', encoding='utf-8') as f: data = json.load(f) - # 计算已下载的图片数量 downloaded_count = 0 if 'all_images' in data: for filename, url in data['all_images'].items(): @@ -124,38 +113,32 @@ def get_all_galleries() -> List[GalleryInfo]: return galleries async def download_single_image(client: httpx.AsyncClient, url: str, file_path: Path, semaphore: asyncio.Semaphore) -> bool: - """下载单张图片 - 精简版""" async with semaphore: try: - logger.info(f"开始下载: {url}") - - if file_path.exists(): - logger.info(f"文件已存在: {file_path}") - return True - - # 第一步:获取中间页面 + # 先获取图片后缀 response = await client.get(url, timeout=DOWNLOAD_TIMEOUT) response.raise_for_status() - # 第二步:提取真实图片URL import re match = re.search(r'img id="img" src="(.*?)"', response.text) if not match: - logger.error(f"无法提取图片URL: {url}") return False real_img_url = match.group(1) - logger.info(f"真实URL: {real_img_url}") + suffix = real_img_url.split('.')[-1] + + # 创建带后缀的文件路径 + file_path_with_suffix = file_path.with_suffix('.' + suffix) + + if file_path_with_suffix.exists(): + return True - # 第三步:下载图片 img_response = await client.get(real_img_url, timeout=DOWNLOAD_TIMEOUT) img_response.raise_for_status() - # 保存图片 - async with aiofiles.open(file_path, 'wb') as f: + async with aiofiles.open(file_path_with_suffix, 'wb') as f: await f.write(img_response.content) - logger.info(f"下载完成: {file_path}") return True except Exception as e: @@ -163,7 +146,6 @@ async def download_single_image(client: httpx.AsyncClient, url: str, file_path: return False async def download_gallery_images(title: str) -> DownloadStatusResponse: - """下载指定画廊的所有图片""" safe_title = sanitize_filename(title) gallery_path = downloads_path / safe_title data_file = gallery_path / "data.json" @@ -178,7 +160,6 @@ async def download_gallery_images(title: str) -> DownloadStatusResponse: ) try: - # 读取画廊数据 async with aiofiles.open(data_file, 'r', encoding='utf-8') as f: content = await f.read() data = json.loads(content) @@ -195,7 +176,6 @@ async def download_gallery_images(title: str) -> DownloadStatusResponse: current_progress=0.0 ) - # 初始化下载状态 download_status[title] = { "downloaded": 0, "total": total_images, @@ -204,10 +184,8 @@ async def download_gallery_images(title: str) -> DownloadStatusResponse: logger.info(f"开始下载画廊 '{title}',共 {total_images} 张图片") - # 创建信号量限制并发数 semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS) - # 使用异步HTTP客户端 async with httpx.AsyncClient( headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' @@ -215,12 +193,10 @@ async def download_gallery_images(title: str) -> DownloadStatusResponse: follow_redirects=True ) as client: - # 准备下载任务 tasks = [] for filename, url in all_images.items(): image_path = gallery_path / filename - # 如果图片已存在,跳过下载但计入完成数量 if image_path.exists(): download_status[title]["downloaded"] += 1 continue @@ -228,26 +204,20 @@ async def download_gallery_images(title: str) -> DownloadStatusResponse: task = download_single_image(client, url, image_path, semaphore) tasks.append(task) - # 批量执行下载任务 if tasks: results = await asyncio.gather(*tasks, return_exceptions=True) - - # 统计成功下载的数量 successful_downloads = sum(1 for result in results if result is True) download_status[title]["downloaded"] += successful_downloads - # 更新最终状态 downloaded_count = download_status[title]["downloaded"] progress = (downloaded_count / total_images) * 100 if downloaded_count == total_images: download_status[title]["status"] = "completed" message = f"下载完成!共下载 {downloaded_count}/{total_images} 张图片" - logger.info(f"画廊 '{title}' {message}") else: download_status[title]["status"] = "partial" message = f"部分完成!下载 {downloaded_count}/{total_images} 张图片" - logger.warning(f"画廊 '{title}' {message}") return DownloadStatusResponse( status="success", @@ -272,28 +242,25 @@ async def download_gallery_images(title: str) -> DownloadStatusResponse: ) async def download_all_pending_galleries(): - """下载所有未完成的画廊""" galleries = get_all_galleries() pending_galleries = [g for g in galleries if g.downloaded_images < g.total_images] + logger.info(f"找到 {len(pending_galleries)} 个待下载画廊") + if not pending_galleries: logger.info("没有待下载的画廊") return - logger.info(f"开始批量下载 {len(pending_galleries)} 个画廊") - for gallery in pending_galleries: - if gallery.downloaded_images < gallery.total_images: - logger.info(f"开始下载画廊: {gallery.title}") - result = await download_gallery_images(gallery.title) - - if result.status == "success": - logger.info(f"画廊 '{gallery.title}' 下载完成: {result.message}") - else: - logger.error(f"画廊 '{gallery.title}' 下载失败: {result.message}") - - # 添加延迟避免请求过于频繁 - await asyncio.sleep(1) + logger.info(f"开始下载画廊: {gallery.title}") + result = await download_gallery_images(gallery.title) + + if result.status == "success": + logger.info(f"画廊 '{gallery.title}' 下载完成: {result.message}") + else: + logger.error(f"画廊 '{gallery.title}' 下载失败: {result.message}") + + await asyncio.sleep(1) logger.info("批量下载任务完成") @@ -301,9 +268,35 @@ async def download_all_pending_galleries(): downloads_path = setup_downloads_directory() # API路由 +@app.post("/save_url") +@app.options("/save_url") +async def save_url_data(request: SaveDataRequest = None): + if not request: + return {"status": "ok"} + + try: + title_dir = create_title_directory(downloads_path, request.title) + data_file = title_dir / "data.json" + await save_data_to_file(data_file, { + "url": request.url, + "title": request.title, + "all_images": request.all_images, + "total_images": request.total_images + }) + + logger.info(f"成功保存数据: {request.title}") + return { + "status": "success", + "message": f"数据保存成功,共 {request.total_images} 张图片", + "path": str(title_dir) + } + + except Exception as e: + logger.error(f"保存数据失败: {e}") + raise HTTPException(status_code=500, detail=f"保存失败: {str(e)}") + @app.get("/", response_class=HTMLResponse) async def read_gallery_manager(): - """画廊管理页面""" return """ @@ -312,193 +305,24 @@ async def read_gallery_manager(): 画廊下载管理器 @@ -508,25 +332,9 @@ async def read_gallery_manager():

管理您的画廊下载任务

-
- 总计: 0 个画廊 - 待下载: 0 - 已完成: 0 -
-
- - - - + +
@@ -737,92 +420,28 @@ async def read_gallery_manager(): @app.get("/api/galleries") async def get_galleries(): - """获取所有画廊信息(包括已完成和未完成的)""" galleries = get_all_galleries() return galleries -@app.post("/api/download/{title}") -async def download_gallery(title: str, background_tasks: BackgroundTasks): - """开始下载指定画廊的图片""" - try: - # 使用后台任务执行下载,避免阻塞请求 - background_tasks.add_task(download_gallery_images, title) - - return { - "status": "success", - "message": f"开始下载画廊: {title}", - "title": title - } - except Exception as e: - raise HTTPException(status_code=500, detail=f"下载失败: {str(e)}") - @app.post("/api/download/all") async def download_all_galleries(background_tasks: BackgroundTasks): - """开始下载所有未完成的画廊""" - try: - # 使用后台任务执行批量下载 - background_tasks.add_task(download_all_pending_galleries) - - return { - "status": "success", - "message": "开始批量下载所有未完成的画廊" - } - except Exception as e: - raise HTTPException(status_code=500, detail=f"批量下载失败: {str(e)}") - -@app.get("/api/download/status/{title}") -async def get_download_status(title: str): - """获取指定画廊的下载状态""" - status = download_status.get(title, {}) - return status + background_tasks.add_task(download_all_pending_galleries) + return { + "status": "success", + "message": "开始批量下载所有未完成的画廊" + } -@app.delete("/api/cleanup") -async def cleanup_json_files(): - """删除所有JSON文件(保留图片)""" - try: - deleted_count = 0 - downloads_path = Path(DOWNLOADS_DIR) - - for gallery_dir in downloads_path.iterdir(): - if gallery_dir.is_dir(): - data_file = gallery_dir / "data.json" - if data_file.exists(): - data_file.unlink() - deleted_count += 1 - - return { - "status": "success", - "message": f"已删除 {deleted_count} 个JSON文件", - "deleted_count": deleted_count - } - except Exception as e: - raise HTTPException(status_code=500, detail=f"清理失败: {str(e)}") - -@app.delete("/api/galleries/{title}") -async def delete_gallery(title: str): - """删除指定画廊的所有文件""" - try: - safe_title = sanitize_filename(title) - gallery_path = downloads_path / safe_title - - if gallery_path.exists(): - # 删除整个画廊目录 - import shutil - shutil.rmtree(gallery_path) - # 清除下载状态 - download_status.pop(title, None) - return { - "status": "success", - "message": f"已删除画廊: {title}" - } - else: - raise HTTPException(status_code=404, detail="画廊不存在") - except Exception as e: - raise HTTPException(status_code=500, detail=f"删除失败: {str(e)}") +@app.post("/api/download/{title}") +async def download_gallery(title: str, background_tasks: BackgroundTasks): + background_tasks.add_task(download_gallery_images, title) + return { + "status": "success", + "message": f"开始下载画廊: {title}", + "title": title + } @app.get("/health") async def health_check(): - """健康检查端点""" return {"status": "healthy"} if __name__ == "__main__": diff --git a/post_eh_data.js b/post_eh_data.js index de5d150..f3bb539 100644 --- a/post_eh_data.js +++ b/post_eh_data.js @@ -14,7 +14,7 @@ // 全局配置 - 请根据实际情况修改这些值 const BACKEND_IP = '127.0.0.1'; const BACKEND_PORT = '5100'; - const BUTTON_LOCATION_SELECTOR = 'body'; + const BUTTON_LOCATION_SELECTOR = '#gd5 > p:nth-child(5)'; const DATA_LIST_SELECTOR = '#gdt a'; // 修改为a标签的选择器 const ALL_IMG_DATA = {}; // 用于储存每一页的图片url, 格式为 {"0001": "https://example001.jpg", "0002": "https://example002.jpg"}, 最高支持4位数至9999 @@ -72,8 +72,9 @@ // 发送数据到后端的函数 function sendDataToBackend(data) { console.log('准备发送的数据:', data); - console.log('后端地址:', `http://${BACKEND_IP}:${BACKEND_PORT}/save_url`); - + console.log('数据类型:', typeof data); + console.log('字符串化后的数据:', JSON.stringify(data)); + return new Promise((resolve, reject) => { GM_xmlhttpRequest({ method: "POST", @@ -85,7 +86,6 @@ onload: function(response) { console.log('后端响应状态:', response.status); console.log('后端响应内容:', response.responseText); - console.log('响应头:', response.responseHeaders); if (response.status === 200) { resolve(response); } else { @@ -93,12 +93,7 @@ } }, onerror: function(error) { - console.error('请求错误详情:', error); reject(error); - }, - ontimeout: function() { - console.error('请求超时'); - reject(new Error('请求超时')); } }); }); @@ -163,7 +158,7 @@ // 从响应中提取图片链接 const pageImages = extractImagesFromPage(response.responseText); - + if (pageImages.length === 0) { console.log(`第${page}页没有图片,可能是最后一页`); return false; // 没有图片,可能是最后一页 @@ -182,7 +177,7 @@ }); console.log(`第${page}页采集完成,获取到${pageImages.length}个图片链接,新增${hasNewImage ? '有新图片' : '全是重复图片'}`); - + return hasNewImage; // 返回是否有新图片 } catch (error) { @@ -195,15 +190,15 @@ let shouldContinue = true; for (let page = 0; page <= 100; page++) { if (!shouldContinue) break; - + const hasNewImages = await processPage(page); - + // 如果没有新图片,说明可能是最后一页了 if (!hasNewImages && page > 0) { console.log(`第${page}页没有新图片,可能已到最后一页,停止采集`); shouldContinue = false; } - + // 如果图片数量达到上限也停止 if (img_count > 2200) { console.log('图片数量达到上限2200,停止采集'); @@ -222,7 +217,7 @@ // 显示结果并发送到后端 console.log('采集完成的所有数据:', data); console.log('后端地址:', BACKEND_IP + ':' + BACKEND_PORT); - + try { await sendDataToBackend(data); alert(`数据采集完成并已保存到后端!\n标题: ${pageTitle}\n总图片链接数量: ${Object.keys(ALL_IMG_DATA).length}`);