parent
a16c47b303
commit
f9bf9826b4
@ -1,73 +0,0 @@ |
||||
import asyncio
import os
import uuid
from pathlib import Path
from typing import Optional

from fastapi import APIRouter, BackgroundTasks
from pydantic import BaseModel
||||
|
||||
router = APIRouter(prefix="/api/v1", tags=["downloader"]) |
||||
|
||||
# 存储任务状态 |
||||
tasks = {} |
||||
|
||||
class CrawlRequest(BaseModel):
    # Request body accepted by POST /start-crawl. All three fields are
    # required strings; run_crawler writes each of them verbatim into the
    # generated download file.
    url: str        # written to the output file as "URL: ..."
    cookies: str    # written to the output file as "Cookies: ..."
    timestamp: str  # written to the output file as "Timestamp: ..."
||||
|
||||
class TaskStatus(BaseModel):
    # Schema mirroring the per-task dicts stored in `tasks`
    # (keys: status / result / error).
    status: str  # 'running', 'completed', 'failed'
    # Both fields default to None, so they are declared Optional: a bare
    # `dict` / `str` annotation with a None default misstates the type, and
    # validators that do not treat such fields as implicitly nullable reject
    # an explicit None (which is exactly what the stored task dicts contain).
    result: Optional[dict] = None
    error: Optional[str] = None
||||
|
||||
@router.post("/start-crawl")
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    # Register a fresh task under a random UUID, mark it as running, hand the
    # actual work to run_crawler as a background task, and immediately return
    # the id so the caller can poll /task-status/{task_id}.
    new_task_id = str(uuid.uuid4())
    tasks[new_task_id] = dict(status='running', result=None, error=None)

    # Scheduled here; executed after the response has been sent.
    background_tasks.add_task(run_crawler, new_task_id, request)

    return dict(task_id=new_task_id, status="started")
||||
|
||||
@router.get("/task-status/{task_id}")
async def get_task_status(task_id: str):
    # Look the task up in the in-memory registry. Unknown ids (or a falsy
    # record) yield a {"status": "not_found"} payload rather than an HTTP
    # error, so the response code is the same either way.
    record = tasks.get(task_id)
    return record if record else {"status": "not_found"}
||||
|
||||
def _write_crawl_report(filepath: Path, request: CrawlRequest) -> None:
    """Create the target directory if needed and write the request fields to *filepath*.

    This is plain blocking I/O; run_crawler runs it in a worker thread.
    """
    # Make sure the downloads directory exists before opening the file.
    filepath.parent.mkdir(exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"URL: {request.url}\n")
        # NOTE(review): cookies are persisted to disk in plain text.
        f.write(f"Cookies: {request.cookies}\n")
        f.write(f"Timestamp: {request.timestamp}\n")
        f.write("Download completed successfully\n")


async def run_crawler(task_id: str, request: CrawlRequest):
    """Run the (placeholder) crawl for *task_id* and record the outcome in ``tasks``.

    On success ``tasks[task_id]`` becomes a 'completed' record whose result
    includes the path of the written file; on any exception it becomes a
    'failed' record carrying ``str(e)``. Nothing is returned or re-raised.

    The blocking work is pushed to the default executor: this coroutine is
    scheduled as a background task, and an ``async def`` task that blocks
    (file I/O now, a long crawl later) would stall every other request
    served by the same event loop.
    """
    try:
        # Simulated download target: downloads/download_<task_id>.txt
        filepath = Path("downloads") / f"download_{task_id}.txt"

        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, _write_crawl_report, filepath, request)

        # Crawl finished: publish the result.
        tasks[task_id] = {
            'status': 'completed',
            'result': {
                'message': '爬虫完成',
                'data': '您的爬虫结果',
                'download_path': str(filepath)
            },
            'error': None
        }
    except Exception as e:
        # Task boundary: surface any failure to the polling client instead of
        # letting it vanish inside the background task.
        tasks[task_id] = {
            'status': 'failed',
            'result': None,
            'error': str(e)
        }
||||
@ -1,200 +1,229 @@ |
||||
// ==UserScript==
|
||||
// @name 数据发送工具
|
||||
// @name eh-v2
|
||||
// @namespace http://tampermonkey.net/
|
||||
// @version 1.0
|
||||
// @description 向本地后端发送当前页面的URL和Cookies
|
||||
// @author You
|
||||
// @version 0.1
|
||||
// @description 采集页面数据并发送到后端
|
||||
// @author Jack
|
||||
// @match *://*/*
|
||||
// @grant GM_xmlhttpRequest
|
||||
// @connect 127.0.0.1
|
||||
// @connect localhost
|
||||
// ==/UserScript==
|
||||
|
||||
(function() { |
||||
'use strict'; |
||||
|
||||
// 配置:您可以修改这些变量来自定义行为
|
||||
const TARGET_SELECTOR = 'body'; // 按钮插入位置的选择器
|
||||
const BACKEND_IP = '127.0.0.1'; // 后端IP地址
|
||||
const BACKEND_PORT = '5100'; // 后端端口号
|
||||
|
||||
// 构建后端基础URL
|
||||
const BACKEND_BASE_URL = `http://${BACKEND_IP}:${BACKEND_PORT}`; |
||||
|
||||
function addButton() { |
||||
if (document.getElementById('data-sender-button')) { |
||||
return; |
||||
} |
||||
|
||||
const button = document.createElement('button'); |
||||
button.id = 'data-sender-button'; |
||||
button.textContent = "send data"; |
||||
button.style.position = "fixed"; |
||||
button.style.top = "12.5%"; |
||||
button.style.right = "1%"; |
||||
button.style.transform = "translateY(-50%)"; |
||||
button.style.padding = "3px 8px"; |
||||
button.style.fontSize = "10px"; |
||||
button.style.backgroundColor = "#007baf"; |
||||
button.style.color = "#fff"; |
||||
button.style.border = "none"; |
||||
button.style.borderRadius = "5px"; |
||||
button.style.cursor = "pointer"; |
||||
button.style.zIndex = "10000"; |
||||
|
||||
button.addEventListener('click', function() { |
||||
sendDataToBackend(); |
||||
}); |
||||
|
||||
const targetElement = document.querySelector(TARGET_SELECTOR); |
||||
|
||||
if (targetElement && TARGET_SELECTOR !== 'body') { |
||||
const buttonContainer = document.createElement('div'); |
||||
buttonContainer.style.display = 'inline-block'; |
||||
buttonContainer.style.marginLeft = '10px'; |
||||
|
||||
button.style.position = 'relative'; |
||||
button.style.top = 'auto'; |
||||
button.style.right = 'auto'; |
||||
button.style.transform = 'none'; |
||||
button.style.margin = '0'; |
||||
|
||||
buttonContainer.appendChild(button); |
||||
|
||||
if (targetElement.nextSibling) { |
||||
targetElement.parentNode.insertBefore(buttonContainer, targetElement.nextSibling); |
||||
} else { |
||||
targetElement.parentNode.appendChild(buttonContainer); |
||||
} |
||||
} else { |
||||
document.body.appendChild(button); |
||||
} |
||||
// 全局配置 - 请根据实际情况修改这些值
|
||||
const BACKEND_IP = '127.0.0.1'; |
||||
const BACKEND_PORT = '5100'; |
||||
const BUTTON_LOCATION_SELECTOR = 'body'; |
||||
const DATA_LIST_SELECTOR = '#gdt a'; // 修改为a标签的选择器
|
||||
const ALL_IMG_DATA = {}; // 用于储存每一页的图片url, 格式为 {"0001": "https://example001.jpg", "0002": "https://example002.jpg"}, 最高支持4位数至9999
|
||||
|
||||
// 创建按钮
|
||||
const button = document.createElement('button'); |
||||
button.id = 'data-sender-button'; |
||||
button.textContent = "send data"; |
||||
button.style.position = "fixed"; |
||||
button.style.top = "32%"; |
||||
button.style.right = "1%"; |
||||
button.style.transform = "translateY(-50%)"; |
||||
button.style.padding = "3px 8px"; |
||||
button.style.fontSize = "10px"; |
||||
button.style.backgroundColor = "#007baf"; |
||||
button.style.color = "#fff"; |
||||
button.style.border = "none"; |
||||
button.style.borderRadius = "5px"; |
||||
button.style.cursor = "pointer"; |
||||
button.style.zIndex = "10000"; |
||||
|
||||
// 添加到指定位置
|
||||
const targetElement = document.querySelector(BUTTON_LOCATION_SELECTOR); |
||||
if (targetElement) { |
||||
targetElement.appendChild(button); |
||||
} else { |
||||
// 如果选择器找不到元素,默认添加到body
|
||||
document.body.appendChild(button); |
||||
} |
||||
|
||||
function sendDataToBackend() { |
||||
const currentUrl = window.location.href; |
||||
const cookies = document.cookie; |
||||
|
||||
const data = { |
||||
url: currentUrl, |
||||
cookies: cookies, |
||||
timestamp: new Date().toISOString() |
||||
}; |
||||
|
||||
// 禁用按钮防止重复点击
|
||||
const button = document.getElementById('data-sender-button'); |
||||
if (button) { |
||||
button.disabled = true; |
||||
button.textContent = "任务进行中..."; |
||||
button.style.backgroundColor = "#6c757d"; |
||||
// 从页面中提取图片的函数
|
||||
/**
 * Collect the href of every element matching DATA_LIST_SELECTOR in an HTML string.
 *
 * The markup is parsed with DOMParser into a separate, inert document instead
 * of being assigned to innerHTML of an element owned by the live page, so the
 * fetched HTML cannot start resource loads or run inline handlers in the
 * current page while it is being inspected.
 *
 * @param {string} htmlContent - Raw HTML text of a fetched page.
 * @returns {string[]} Non-empty href values in document order; an empty array
 *     when DATA_LIST_SELECTOR is falsy or nothing matches.
 */
function extractImagesFromPage(htmlContent) {
    if (!DATA_LIST_SELECTOR) {
        return [];
    }

    const parsedDoc = new DOMParser().parseFromString(htmlContent, 'text/html');

    // `href` (the property, not the attribute) yields the resolved absolute URL;
    // anchors without a usable href produce '' and are dropped.
    return Array.from(parsedDoc.querySelectorAll(DATA_LIST_SELECTOR))
        .map(link => link.href)
        .filter(Boolean);
}
||||
|
||||
// 发送任务请求
|
||||
GM_xmlhttpRequest({ |
||||
method: "POST", |
||||
url: `${BACKEND_BASE_URL}/start-crawl`, |
||||
headers: { |
||||
"Content-Type": "application/json" |
||||
}, |
||||
data: JSON.stringify(data), |
||||
onload: function(response) { |
||||
if (response.status === 200) { |
||||
const result = JSON.parse(response.responseText); |
||||
if (result.task_id) { |
||||
alert("爬虫任务已启动!任务ID: " + result.task_id); |
||||
// 开始轮询任务状态
|
||||
pollTaskStatus(result.task_id); |
||||
} else { |
||||
alert("任务启动失败: " + (result.message || "未知错误")); |
||||
resetButton(); |
||||
} |
||||
} else { |
||||
alert("请求失败,状态码: " + response.status); |
||||
resetButton(); |
||||
} |
||||
}, |
||||
onerror: function(error) { |
||||
console.error("数据发送失败:", error); |
||||
alert("数据发送失败,请检查后端服务是否运行"); |
||||
resetButton(); |
||||
} |
||||
}); |
||||
// 格式化数字为4位数
|
||||
/**
 * Render a number as a string left-padded with zeros to at least 4 characters.
 * Values whose string form is already 4 or more characters are returned unpadded.
 *
 * @param {number} num - Value to format.
 * @returns {string} Zero-padded string, e.g. 7 -> "0007".
 */
function formatNumber(num) {
    const digits = num.toString();
    return digits.padStart(4, '0');
}
||||
|
||||
function pollTaskStatus(taskId) { |
||||
let pollCount = 0; |
||||
const maxPolls = 300; // 最多轮询300次(5分钟,每秒一次)
|
||||
// 发送数据到后端的函数
|
||||
function sendDataToBackend(data) { |
||||
console.log('准备发送的数据:', data); |
||||
console.log('数据类型:', typeof data); |
||||
console.log('字符串化后的数据:', JSON.stringify(data)); |
||||
|
||||
const pollInterval = setInterval(() => { |
||||
pollCount++; |
||||
|
||||
return new Promise((resolve, reject) => { |
||||
GM_xmlhttpRequest({ |
||||
method: "GET", |
||||
url: `${BACKEND_BASE_URL}/task-status/${taskId}`, |
||||
method: "POST", |
||||
url: `http://${BACKEND_IP}:${BACKEND_PORT}/save_url`, |
||||
headers: { |
||||
"Content-Type": "application/json", |
||||
}, |
||||
data: JSON.stringify(data), |
||||
onload: function(response) { |
||||
console.log('后端响应状态:', response.status); |
||||
console.log('后端响应内容:', response.responseText); |
||||
if (response.status === 200) { |
||||
const result = JSON.parse(response.responseText); |
||||
|
||||
// 更新按钮状态显示进度
|
||||
const button = document.getElementById('data-sender-button'); |
||||
if (button) { |
||||
button.textContent = `任务中...${pollCount}s`; |
||||
} |
||||
|
||||
if (result.status === 'completed') { |
||||
clearInterval(pollInterval); |
||||
alert("爬虫任务完成!\n结果: " + JSON.stringify(result.result, null, 2)); |
||||
resetButton(); |
||||
} else if (result.status === 'failed') { |
||||
clearInterval(pollInterval); |
||||
alert("爬虫任务失败: " + result.error); |
||||
resetButton(); |
||||
} |
||||
// 如果状态是 'running',继续轮询
|
||||
resolve(response); |
||||
} else { |
||||
console.error("获取任务状态失败:", response.status); |
||||
reject(new Error(`后端返回错误: ${response.status} - ${response.responseText}`)); |
||||
} |
||||
}, |
||||
onerror: function(error) { |
||||
console.error("轮询任务状态失败:", error); |
||||
reject(error); |
||||
} |
||||
}); |
||||
|
||||
// 超过最大轮询次数,停止轮询
|
||||
if (pollCount >= maxPolls) { |
||||
clearInterval(pollInterval); |
||||
alert("任务超时,请稍后手动检查结果"); |
||||
resetButton(); |
||||
} |
||||
}, 1000); // 每秒轮询一次
|
||||
}); |
||||
} |
||||
|
||||
function resetButton() { |
||||
const button = document.getElementById('data-sender-button'); |
||||
if (button) { |
||||
button.disabled = false; |
||||
button.textContent = "send data"; |
||||
button.style.backgroundColor = "#007baf"; |
||||
// 点击事件处理
|
||||
button.addEventListener('click', async function() { |
||||
// 1. 获取当前URL和title
|
||||
const currentUrl = window.location.href; |
||||
const pageTitle = document.title; |
||||
|
||||
// 清空之前的图片数据
|
||||
Object.keys(ALL_IMG_DATA).forEach(key => delete ALL_IMG_DATA[key]); |
||||
|
||||
let img_count = 1; |
||||
|
||||
// 首先处理当前页(第0页)的图片
|
||||
if (DATA_LIST_SELECTOR) { |
||||
const linkElements = document.querySelectorAll(DATA_LIST_SELECTOR); |
||||
linkElements.forEach(link => { |
||||
const href = link.href; |
||||
if (href) { |
||||
ALL_IMG_DATA[formatNumber(img_count)] = href; |
||||
img_count++; |
||||
} |
||||
}); |
||||
} |
||||
} |
||||
|
||||
// 初始尝试添加按钮
|
||||
addButton(); |
||||
// alert(`开始采集数据!\n当前页图片链接数量: ${Object.keys(ALL_IMG_DATA).length}\n开始采集其他页面...`);
|
||||
|
||||
// 使用MutationObserver监听DOM变化
|
||||
const observer = new MutationObserver(function(mutations) { |
||||
addButton(); |
||||
}); |
||||
// 处理单个页面的函数
|
||||
const processPage = async (page) => { |
||||
// 构建分页URL
|
||||
let newTargetUrl; |
||||
if (currentUrl.includes('?')) { |
||||
newTargetUrl = currentUrl.replace(/([?&])p=\d+/, `$1p=${page}`); |
||||
if (!newTargetUrl.includes('p=')) { |
||||
newTargetUrl += `&p=${page}`; |
||||
} |
||||
} else { |
||||
newTargetUrl = currentUrl + `?p=${page}`; |
||||
} |
||||
|
||||
observer.observe(document.body, { |
||||
childList: true, |
||||
subtree: true |
||||
}); |
||||
try { |
||||
// 使用GM_xmlhttpRequest发送请求
|
||||
const response = await new Promise((resolve, reject) => { |
||||
GM_xmlhttpRequest({ |
||||
method: "GET", |
||||
url: newTargetUrl, |
||||
headers: { |
||||
"Referer": currentUrl, |
||||
"Cookie": document.cookie |
||||
}, |
||||
onload: function(response) { |
||||
resolve(response); |
||||
}, |
||||
onerror: function(error) { |
||||
reject(error); |
||||
} |
||||
}); |
||||
}); |
||||
|
||||
// 从响应中提取图片链接
|
||||
const pageImages = extractImagesFromPage(response.responseText); |
||||
|
||||
if (pageImages.length === 0) { |
||||
console.log(`第${page}页没有图片,可能是最后一页`); |
||||
return false; // 没有图片,可能是最后一页
|
||||
} |
||||
|
||||
if (document.readyState === 'loading') { |
||||
document.addEventListener('DOMContentLoaded', addButton); |
||||
} else { |
||||
addButton(); |
||||
} |
||||
// 检查是否有重复图片
|
||||
let hasNewImage = false; |
||||
pageImages.forEach(href => { |
||||
// 检查这个图片是否已经存在
|
||||
const isDuplicate = Object.values(ALL_IMG_DATA).includes(href); |
||||
if (!isDuplicate) { |
||||
ALL_IMG_DATA[formatNumber(img_count)] = href; |
||||
img_count++; |
||||
hasNewImage = true; |
||||
} |
||||
}); |
||||
|
||||
console.log(`第${page}页采集完成,获取到${pageImages.length}个图片链接,新增${hasNewImage ? '有新图片' : '全是重复图片'}`); |
||||
|
||||
return hasNewImage; // 返回是否有新图片
|
||||
|
||||
} catch (error) { |
||||
console.error(`第${page}页采集失败:`, error); |
||||
return false; |
||||
} |
||||
}; |
||||
|
||||
// Crawl the paginated views starting at p=0 (the loop index begins at 0, not 1), up to p=100
|
||||
let shouldContinue = true; |
||||
for (let page = 0; page <= 100; page++) { |
||||
if (!shouldContinue) break; |
||||
|
||||
const hasNewImages = await processPage(page); |
||||
|
||||
// 如果没有新图片,说明可能是最后一页了
|
||||
if (!hasNewImages && page > 0) { |
||||
console.log(`第${page}页没有新图片,可能已到最后一页,停止采集`); |
||||
shouldContinue = false; |
||||
} |
||||
|
||||
// 如果图片数量达到上限也停止
|
||||
if (img_count > 2200) { |
||||
console.log('图片数量达到上限2200,停止采集'); |
||||
shouldContinue = false; |
||||
} |
||||
} |
||||
|
||||
// 打包最终数据
|
||||
const data = { |
||||
url: currentUrl, |
||||
title: pageTitle, |
||||
all_images: ALL_IMG_DATA, |
||||
total_images: Object.keys(ALL_IMG_DATA).length |
||||
}; |
||||
|
||||
// 显示结果并发送到后端
|
||||
console.log('采集完成的所有数据:', data); |
||||
console.log('后端地址:', BACKEND_IP + ':' + BACKEND_PORT); |
||||
|
||||
try { |
||||
await sendDataToBackend(data); |
||||
alert(`数据采集完成并已保存到后端!\n标题: ${pageTitle}\n总图片链接数量: ${Object.keys(ALL_IMG_DATA).length}`); |
||||
} catch (error) { |
||||
console.error('发送数据到后端失败:', error); |
||||
alert(`数据采集完成但保存到后端失败!\n错误: ${error.message}\n请在控制台查看完整数据`); |
||||
} |
||||
}); |
||||
})(); |
||||
Loading…
Reference in new issue