From f9bf9826b4e21a98f73488c7923aa862be5e0316 Mon Sep 17 00:00:00 2001 From: Jack Date: Sat, 22 Nov 2025 22:16:02 +0800 Subject: [PATCH] update --- downloader.py | 73 ------- main.py | 560 +++++++++++++++++++++++++++++++++++++++++++++--- post_eh_data.js | 361 +++++++++++++++++-------------- 3 files changed, 721 insertions(+), 273 deletions(-) delete mode 100644 downloader.py diff --git a/downloader.py b/downloader.py deleted file mode 100644 index 48932d2..0000000 --- a/downloader.py +++ /dev/null @@ -1,73 +0,0 @@ -from fastapi import APIRouter, BackgroundTasks -from pydantic import BaseModel -import uuid -import os -from pathlib import Path - -router = APIRouter(prefix="/api/v1", tags=["downloader"]) - -# 存储任务状态 -tasks = {} - -class CrawlRequest(BaseModel): - url: str - cookies: str - timestamp: str - -class TaskStatus(BaseModel): - status: str # 'running', 'completed', 'failed' - result: dict = None - error: str = None - -@router.post("/start-crawl") -async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks): - task_id = str(uuid.uuid4()) - tasks[task_id] = {'status': 'running', 'result': None, 'error': None} - - # 在后台运行爬虫任务 - background_tasks.add_task(run_crawler, task_id, request) - - return {"task_id": task_id, "status": "started"} - -@router.get("/task-status/{task_id}") -async def get_task_status(task_id: str): - task = tasks.get(task_id) - if not task: - return {"status": "not_found"} - return task - -async def run_crawler(task_id: str, request: CrawlRequest): - try: - # 这里执行您的爬虫逻辑,模拟长时间运行 - # 例如:time.sleep(300) # 5分钟 - - # 确保 downloads 目录存在(双重保障) - downloads_dir = Path("downloads") - downloads_dir.mkdir(exist_ok=True) - - # 模拟下载文件到 downloads 目录 - filename = f"download_{task_id}.txt" - filepath = downloads_dir / filename - - with open(filepath, 'w', encoding='utf-8') as f: - f.write(f"URL: {request.url}\n") - f.write(f"Cookies: {request.cookies}\n") - f.write(f"Timestamp: {request.timestamp}\n") - f.write("Download completed successfully\n") - - # 爬虫完成后更新状态 - tasks[task_id] = { - 'status': 'completed', - 'result': { - 'message': '爬虫完成', - 'data': '您的爬虫结果', - 'download_path': str(filepath) - }, - 'error': None - } - except Exception as e: - tasks[task_id] = { - 'status': 'failed', - 'result': None, - 'error': str(e) - } diff --git a/main.py b/main.py index acf47f5..4b9f0b0 100644 --- a/main.py +++ b/main.py @@ -1,43 +1,535 @@ -from fastapi import FastAPI -from contextlib import asynccontextmanager -import uvicorn +# main.py +import os +import json +import logging from pathlib import Path -from downloader import router as downloader_router - -# 检查并创建 downloads 目录 -def ensure_downloads_dir(): - downloads_dir = Path("downloads") - downloads_dir.mkdir(exist_ok=True) - print(f"确保 downloads 目录存在: {downloads_dir.absolute()}") - -# lifespan 事件处理器 -@asynccontextmanager -async def lifespan(app: FastAPI): - # 启动时执行 - ensure_downloads_dir() - print("应用启动完成!") - yield - # 关闭时执行(可选) - print("应用正在关闭...") - -app = FastAPI( - title="下载器API", - description="一个基于FastAPI的异步下载器服务", - version="1.0.0", - lifespan=lifespan +from typing import Dict, Any, List + +import aiofiles +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import HTMLResponse +from fastapi.staticfiles import StaticFiles +from pydantic import BaseModel +import uvicorn + +# 配置日志 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) +logger = logging.getLogger(__name__) + +# 常量定义 +DOWNLOADS_DIR = "downloads" +MAX_FILENAME_LENGTH = 100 +INVALID_FILENAME_CHARS = '<>:"/\\|?*' + +# FastAPI应用 +app = FastAPI(title="eh-v2") + +# 数据模型 +class SaveDataRequest(BaseModel): + url: str + title: str + all_images: Dict[str, str] + total_images: int + +class GalleryInfo(BaseModel): + title: str + path: str + total_images: int + downloaded_images: int + +# 工具函数 +def setup_downloads_directory() -> Path: + """创建并返回下载目录路径""" + downloads_path = Path(DOWNLOADS_DIR) + downloads_path.mkdir(exist_ok=True) + logger.info(f"下载目录已准备: {downloads_path.absolute()}") + return downloads_path + +def sanitize_filename(filename: str) -> str: + """清理文件名,移除非法字符并限制长度""" + sanitized = filename + for char in INVALID_FILENAME_CHARS: + sanitized = sanitized.replace(char, '_') + + # 限制文件名长度 + if len(sanitized) > MAX_FILENAME_LENGTH: + sanitized = sanitized[:MAX_FILENAME_LENGTH] + + return sanitized + +def create_title_directory(base_path: Path, title: str) -> Path: + """创建标题对应的目录""" + safe_title = sanitize_filename(title) + title_dir = base_path / safe_title + title_dir.mkdir(exist_ok=True) + logger.info(f"创建标题目录: {title_dir}") + return title_dir + +async def save_data_to_file(file_path: Path, data: Dict[str, Any]) -> None: + """异步保存数据到JSON文件""" + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(json.dumps(data, ensure_ascii=False, indent=2)) + +def get_all_galleries() -> List[GalleryInfo]: + """获取所有画廊信息""" + galleries = [] + downloads_path = Path(DOWNLOADS_DIR) + + if not downloads_path.exists(): + return galleries + + for gallery_dir in downloads_path.iterdir(): + if gallery_dir.is_dir(): + data_file = gallery_dir / "data.json" + if data_file.exists(): + try: + with open(data_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + # 计算已下载的图片数量 + downloaded_count = 0 + if 'all_images' in data: + for filename, url in data['all_images'].items(): + image_path = gallery_dir / filename + if image_path.exists(): + downloaded_count += 1 + + galleries.append(GalleryInfo( + title=data.get('title', gallery_dir.name), + path=str(gallery_dir), + total_images=data.get('total_images', 0), + downloaded_images=downloaded_count + )) + except Exception as e: + logger.error(f"读取画廊数据失败 {gallery_dir}: {e}") + + return galleries + +# 初始化 +downloads_path = setup_downloads_directory() + +# API路由 +@app.post("/save_url") +async def save_url(data: SaveDataRequest): + """保存URL数据到文件系统""" + try: + logger.info("收到保存数据请求") + logger.info(f"标题: {data.title}, URL: {data.url}, 图片数量: {data.total_images}") + + # 创建标题目录 + title_dir = create_title_directory(downloads_path, data.title) + + # 数据文件路径 + data_file = title_dir / "data.json" + + # 异步保存数据 + await save_data_to_file(data_file, data.dict()) + + logger.info(f"数据已保存到: {data_file}") + + return { + "status": "success", + "message": "数据保存成功", + "file_path": str(data_file), + "title": data.title, + "total_images": data.total_images + } + + except Exception as e: + error_msg = f"保存数据时出错: {str(e)}" + logger.error(error_msg) + logger.exception("详细错误信息:") + raise HTTPException(status_code=500, detail=error_msg) + +@app.get("/", response_class=HTMLResponse) +async def read_gallery_manager(): + """画廊管理页面""" + return """ + + + + + + 画廊下载管理器 + + + +
+
+

🎨 画廊下载管理器

+

管理您的画廊下载任务

+
+ +
+ + + +
+ + +
+ + + + + """ + +@app.get("/api/galleries") +async def get_galleries(): + """获取所有画廊信息""" + galleries = get_all_galleries() + return galleries + +@app.post("/api/download/{title}") +async def download_gallery(title: str): + """开始下载指定画廊的图片""" + try: + # 这里实现图片下载逻辑 + # 遍历 all_images 字典,下载每个图片 + return { + "status": "success", + "message": f"开始下载画廊: {title}", + "title": title + } + except Exception as e: + raise HTTPException(status_code=500, detail=f"下载失败: {str(e)}") + +@app.delete("/api/cleanup") +async def cleanup_json_files(): + """删除所有JSON文件(保留图片)""" + try: + deleted_count = 0 + downloads_path = Path(DOWNLOADS_DIR) + + for gallery_dir in downloads_path.iterdir(): + if gallery_dir.is_dir(): + data_file = gallery_dir / "data.json" + if data_file.exists(): + data_file.unlink() + deleted_count += 1 + + return { + "status": "success", + "message": f"已删除 {deleted_count} 个JSON文件", + "deleted_count": deleted_count + } + except Exception as e: + raise HTTPException(status_code=500, detail=f"清理失败: {str(e)}") -# 注册路由 -app.include_router(downloader_router) +@app.delete("/api/galleries/{title}") +async def delete_gallery(title: str): + """删除指定画廊的所有文件""" + try: + safe_title = sanitize_filename(title) + gallery_path = downloads_path / safe_title + + if gallery_path.exists(): + # 删除整个画廊目录 + import shutil + shutil.rmtree(gallery_path) + return { + "status": "success", + "message": f"已删除画廊: {title}" + } + else: + raise HTTPException(status_code=404, detail="画廊不存在") + except Exception as e: + raise HTTPException(status_code=500, detail=f"删除失败: {str(e)}") -@app.get("/") -async def root(): - return {"message": "下载器服务运行中", "status": "healthy"} +@app.get("/health") +async def health_check(): + """健康检查端点""" + return {"status": "healthy"} if __name__ == "__main__": uvicorn.run( "main:app", - host="0.0.0.0", + host="0.0.0.0", port=5100, - reload=True # 开发时自动重载 - ) + reload=True + ) \ No newline at end of file diff --git a/post_eh_data.js b/post_eh_data.js index 0173e1a..b059167 100644 --- a/post_eh_data.js +++ b/post_eh_data.js @@ -1,200 +1,229 @@ // ==UserScript== -// @name 数据发送工具 +// @name eh-v2 // @namespace http://tampermonkey.net/ -// @version 1.0 -// @description 向本地后端发送当前页面的URL和Cookies -// @author You +// @version 0.1 +// @description 采集页面数据并发送到后端 +// @author Jack // @match *://*/* // @grant GM_xmlhttpRequest -// @connect 127.0.0.1 -// @connect localhost // ==/UserScript== (function() { 'use strict'; - // 配置:您可以修改这些变量来自定义行为 - const TARGET_SELECTOR = 'body'; // 按钮插入位置的选择器 - const BACKEND_IP = '127.0.0.1'; // 后端IP地址 - const BACKEND_PORT = '5100'; // 后端端口号 - - // 构建后端基础URL - const BACKEND_BASE_URL = `http://${BACKEND_IP}:${BACKEND_PORT}`; - - function addButton() { - if (document.getElementById('data-sender-button')) { - return; - } - - const button = document.createElement('button'); - button.id = 'data-sender-button'; - button.textContent = "send data"; - button.style.position = "fixed"; - button.style.top = "12.5%"; - button.style.right = "1%"; - button.style.transform = "translateY(-50%)"; - button.style.padding = "3px 8px"; - button.style.fontSize = "10px"; - button.style.backgroundColor = "#007baf"; - button.style.color = "#fff"; - button.style.border = "none"; - button.style.borderRadius = "5px"; - button.style.cursor = "pointer"; - button.style.zIndex = "10000"; - - button.addEventListener('click', function() { - sendDataToBackend(); - }); - - const targetElement = document.querySelector(TARGET_SELECTOR); - - if (targetElement && TARGET_SELECTOR !== 'body') { - const buttonContainer = document.createElement('div'); - buttonContainer.style.display = 'inline-block'; - buttonContainer.style.marginLeft = '10px'; - - button.style.position = 'relative'; - button.style.top = 'auto'; - button.style.right = 'auto'; - button.style.transform = 'none'; - button.style.margin = '0'; - - buttonContainer.appendChild(button); - - if (targetElement.nextSibling) { - targetElement.parentNode.insertBefore(buttonContainer, targetElement.nextSibling); - } else { - targetElement.parentNode.appendChild(buttonContainer); - } - } else { - document.body.appendChild(button); - } + // 全局配置 - 请根据实际情况修改这些值 + const BACKEND_IP = '127.0.0.1'; + const BACKEND_PORT = '5100'; + const BUTTON_LOCATION_SELECTOR = 'body'; + const DATA_LIST_SELECTOR = '#gdt a'; // 修改为a标签的选择器 + const ALL_IMG_DATA = {}; // 用于储存每一页的图片url, 格式为 {"0001": "https://example001.jpg", "0002": "https://example002.jpg"}, 最高支持4位数至9999 + + // 创建按钮 + const button = document.createElement('button'); + button.id = 'data-sender-button'; + button.textContent = "send data"; + button.style.position = "fixed"; + button.style.top = "32%"; + button.style.right = "1%"; + button.style.transform = "translateY(-50%)"; + button.style.padding = "3px 8px"; + button.style.fontSize = "10px"; + button.style.backgroundColor = "#007baf"; + button.style.color = "#fff"; + button.style.border = "none"; + button.style.borderRadius = "5px"; + button.style.cursor = "pointer"; + button.style.zIndex = "10000"; + + // 添加到指定位置 + const targetElement = document.querySelector(BUTTON_LOCATION_SELECTOR); + if (targetElement) { + targetElement.appendChild(button); + } else { + // 如果选择器找不到元素,默认添加到body + document.body.appendChild(button); } - function sendDataToBackend() { - const currentUrl = window.location.href; - const cookies = document.cookie; - - const data = { - url: currentUrl, - cookies: cookies, - timestamp: new Date().toISOString() - }; - - // 禁用按钮防止重复点击 - const button = document.getElementById('data-sender-button'); - if (button) { - button.disabled = true; - button.textContent = "任务进行中..."; - button.style.backgroundColor = "#6c757d"; + // 从页面中提取图片的函数 + function extractImagesFromPage(htmlContent) { + const images = []; + // 创建一个临时div来解析HTML + const tempDiv = document.createElement('div'); + tempDiv.innerHTML = htmlContent; + + if (DATA_LIST_SELECTOR) { + const linkElements = tempDiv.querySelectorAll(DATA_LIST_SELECTOR); + linkElements.forEach(link => { + // 从a标签中获取href属性,这通常是图片页面链接 + const href = link.href; + if (href) { + images.push(href); + } + }); } + return images; + } - // 发送任务请求 - GM_xmlhttpRequest({ - method: "POST", - url: `${BACKEND_BASE_URL}/start-crawl`, - headers: { - "Content-Type": "application/json" - }, - data: JSON.stringify(data), - onload: function(response) { - if (response.status === 200) { - const result = JSON.parse(response.responseText); - if (result.task_id) { - alert("爬虫任务已启动!任务ID: " + result.task_id); - // 开始轮询任务状态 - pollTaskStatus(result.task_id); - } else { - alert("任务启动失败: " + (result.message || "未知错误")); - resetButton(); - } - } else { - alert("请求失败,状态码: " + response.status); - resetButton(); - } - }, - onerror: function(error) { - console.error("数据发送失败:", error); - alert("数据发送失败,请检查后端服务是否运行"); - resetButton(); - } - }); + // 格式化数字为4位数 + function formatNumber(num) { + return num.toString().padStart(4, '0'); } - function pollTaskStatus(taskId) { - let pollCount = 0; - const maxPolls = 300; // 最多轮询300次(5分钟,每秒一次) + // 发送数据到后端的函数 + function sendDataToBackend(data) { + console.log('准备发送的数据:', data); + console.log('数据类型:', typeof data); + console.log('字符串化后的数据:', JSON.stringify(data)); - const pollInterval = setInterval(() => { - pollCount++; - + return new Promise((resolve, reject) => { GM_xmlhttpRequest({ - method: "GET", - url: `${BACKEND_BASE_URL}/task-status/${taskId}`, + method: "POST", + url: `http://${BACKEND_IP}:${BACKEND_PORT}/save_url`, + headers: { + "Content-Type": "application/json", + }, + data: JSON.stringify(data), onload: function(response) { + console.log('后端响应状态:', response.status); + console.log('后端响应内容:', response.responseText); if (response.status === 200) { - const result = JSON.parse(response.responseText); - - // 更新按钮状态显示进度 - const button = document.getElementById('data-sender-button'); - if (button) { - button.textContent = `任务中...${pollCount}s`; - } - - if (result.status === 'completed') { - clearInterval(pollInterval); - alert("爬虫任务完成!\n结果: " + JSON.stringify(result.result, null, 2)); - resetButton(); - } else if (result.status === 'failed') { - clearInterval(pollInterval); - alert("爬虫任务失败: " + result.error); - resetButton(); - } - // 如果状态是 'running',继续轮询 + resolve(response); } else { - console.error("获取任务状态失败:", response.status); + reject(new Error(`后端返回错误: ${response.status} - ${response.responseText}`)); } }, onerror: function(error) { - console.error("轮询任务状态失败:", error); + reject(error); } }); - - // 超过最大轮询次数,停止轮询 - if (pollCount >= maxPolls) { - clearInterval(pollInterval); - alert("任务超时,请稍后手动检查结果"); - resetButton(); - } - }, 1000); // 每秒轮询一次 + }); } - function resetButton() { - const button = document.getElementById('data-sender-button'); - if (button) { - button.disabled = false; - button.textContent = "send data"; - button.style.backgroundColor = "#007baf"; + // 点击事件处理 + button.addEventListener('click', async function() { + // 1. 获取当前URL和title + const currentUrl = window.location.href; + const pageTitle = document.title; + + // 清空之前的图片数据 + Object.keys(ALL_IMG_DATA).forEach(key => delete ALL_IMG_DATA[key]); + + let img_count = 1; + + // 首先处理当前页(第0页)的图片 + if (DATA_LIST_SELECTOR) { + const linkElements = document.querySelectorAll(DATA_LIST_SELECTOR); + linkElements.forEach(link => { + const href = link.href; + if (href) { + ALL_IMG_DATA[formatNumber(img_count)] = href; + img_count++; + } + }); } - } - // 初始尝试添加按钮 - addButton(); + // alert(`开始采集数据!\n当前页图片链接数量: ${Object.keys(ALL_IMG_DATA).length}\n开始采集其他页面...`); - // 使用MutationObserver监听DOM变化 - const observer = new MutationObserver(function(mutations) { - addButton(); - }); + // 处理单个页面的函数 + const processPage = async (page) => { + // 构建分页URL + let newTargetUrl; + if (currentUrl.includes('?')) { + newTargetUrl = currentUrl.replace(/([?&])p=\d+/, `$1p=${page}`); + if (!newTargetUrl.includes('p=')) { + newTargetUrl += `&p=${page}`; + } + } else { + newTargetUrl = currentUrl + `?p=${page}`; + } - observer.observe(document.body, { - childList: true, - subtree: true - }); + try { + // 使用GM_xmlhttpRequest发送请求 + const response = await new Promise((resolve, reject) => { + GM_xmlhttpRequest({ + method: "GET", + url: newTargetUrl, + headers: { + "Referer": currentUrl, + "Cookie": document.cookie + }, + onload: function(response) { + resolve(response); + }, + onerror: function(error) { + reject(error); + } + }); + }); + + // 从响应中提取图片链接 + const pageImages = extractImagesFromPage(response.responseText); + + if (pageImages.length === 0) { + console.log(`第${page}页没有图片,可能是最后一页`); + return false; // 没有图片,可能是最后一页 + } - if (document.readyState === 'loading') { - document.addEventListener('DOMContentLoaded', addButton); - } else { - addButton(); - } + // 检查是否有重复图片 + let hasNewImage = false; + pageImages.forEach(href => { + // 检查这个图片是否已经存在 + const isDuplicate = Object.values(ALL_IMG_DATA).includes(href); + if (!isDuplicate) { + ALL_IMG_DATA[formatNumber(img_count)] = href; + img_count++; + hasNewImage = true; + } + }); + + console.log(`第${page}页采集完成,获取到${pageImages.length}个图片链接,新增${hasNewImage ? '有新图片' : '全是重复图片'}`); + + return hasNewImage; // 返回是否有新图片 + } catch (error) { + console.error(`第${page}页采集失败:`, error); + return false; + } + }; + + // 从第1页开始采集,最多到100页 + let shouldContinue = true; + for (let page = 0; page <= 100; page++) { + if (!shouldContinue) break; + + const hasNewImages = await processPage(page); + + // 如果没有新图片,说明可能是最后一页了 + if (!hasNewImages && page > 0) { + console.log(`第${page}页没有新图片,可能已到最后一页,停止采集`); + shouldContinue = false; + } + + // 如果图片数量达到上限也停止 + if (img_count > 2200) { + console.log('图片数量达到上限2200,停止采集'); + shouldContinue = false; + } + } + + // 打包最终数据 + const data = { + url: currentUrl, + title: pageTitle, + all_images: ALL_IMG_DATA, + total_images: Object.keys(ALL_IMG_DATA).length + }; + + // 显示结果并发送到后端 + console.log('采集完成的所有数据:', data); + console.log('后端地址:', BACKEND_IP + ':' + BACKEND_PORT); + + try { + await sendDataToBackend(data); + alert(`数据采集完成并已保存到后端!\n标题: ${pageTitle}\n总图片链接数量: ${Object.keys(ALL_IMG_DATA).length}`); + } catch (error) { + console.error('发送数据到后端失败:', error); + alert(`数据采集完成但保存到后端失败!\n错误: ${error.message}\n请在控制台查看完整数据`); + } + }); })(); \ No newline at end of file