first commit

main
jack 4 weeks ago
commit 73b8361880

5 changed files:
    .gitignore        +4
    README.md         +5
    kaizty_spider.py  +291
    keys.txt          +1
    random_proxy.py   +56

.gitignore

@@ -0,0 +1,4 @@
.DS_Store
__pycache__/
*.pyc
.idea

README.md

@@ -0,0 +1,5 @@
# spider_kaizty
# Target site: https://www.kaizty.com
# Dependency: pip install httpx

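Before running the spider it may help to confirm the local proxy is reachable; a minimal sketch (assumes Clash is listening on 127.0.0.1:7890 and an httpx version that still accepts the proxies= keyword; newer releases spell it proxy=):

    # proxy_check.py -- hypothetical helper, not part of this commit
    import httpx

    with httpx.Client(proxies="http://127.0.0.1:7890", timeout=10) as client:
        r = client.get("https://www.kaizty.com")
        print(r.status_code)  # expect 200 when the proxy and the site are up
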
kaizty_spider.py

@@ -0,0 +1,291 @@
import time
import asyncio
import random
import re
import json
import os
import concurrent.futures

import httpx

max_workers = 2  # thread-pool size for image downloads
proxies = "http://127.0.0.1:7890"  # local Clash HTTP proxy


def check_urls_json_exists(key):
    # Walk downloads/ looking for an existing <key>.json
    downloads_path = os.path.join(os.getcwd(), "downloads")
    for root, dirs, files in os.walk(downloads_path):
        if f"{key}.json" in files:
            json_path = os.path.basename(root)  # folder that holds the JSON
            print(f'JSON file already exists: {json_path}')
            return True
    return False

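# Directory layout searched above (illustrative; created by get_urls/save_img below):
# downloads/
#     <folder_name>/
#         <key>.json
#         001.jpg, 002.jpg, ...
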
def check_and_load_keys():
    # Read keys from keys.txt, one per line
    keys_file = os.path.join(os.getcwd(), "keys.txt")
    if not os.path.exists(keys_file):
        print("keys.txt not found.\nCreating an empty keys.txt.")
        with open(keys_file, "w", encoding="utf-8") as f:
            f.write("")
        exit(0)
    with open(keys_file, "r", encoding="utf-8") as f:
        keys = [line.strip() for line in f if line.strip()]
    if keys:
        return keys
    print("keys.txt is empty.\nPlease add keys.")
    exit(0)


async def fetch_page(client, url):
    try:
        response = await client.get(url)
        response.raise_for_status()  # raise on non-2xx responses
        return response.text
    except httpx.HTTPError as e:
        print(f"Request failed: {e}")
        return None


def extract_image_links(content):
    # Extract image URLs with a regular expression
    pattern = r'<meta itemprop="image" content="(.*?)">'
    image_links = re.findall(pattern, content)
    return image_links

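# Example tag the pattern above matches (hypothetical URL):
# <meta itemprop="image" content="https://example.com/images/001.jpg">
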
def clean_folder_name(title):
    # Sanitize the title into a legal Windows folder name
    invalid_chars = r'[<>:"/\\|?*\x00-\x1F]'
    title = re.sub(invalid_chars, '_', title)  # replace illegal characters with underscores
    title = title.replace(" ", "")             # drop spaces
    title = title.replace("_", "")             # drop underscores
    return title.strip()

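# Example (hypothetical title): 'Model: Photo Set 01' -> 'ModelPhotoSet01'
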
async def get_urls(key):
    # Check whether this key has already been crawled
    is_exists = check_urls_json_exists(key)
    if is_exists:
        print(f"{key}.json already exists; skipping crawl.")
        return
    base_url = f"https://www.kaizty.com/photos/{key}.html?page="
    data = {}
    folder_name = "default_folder"  # fallback folder name
    async with httpx.AsyncClient(proxies=proxies) as client:
        n = 1
        for page in range(1, 30):
            url = base_url + str(page)
            print(f"Crawling page: {url}")
            content = await fetch_page(client, url)
            if content is None:
                print(f"Could not fetch page content: {url}")
                continue
            # Stop once the page reports no content
            if "EMPTY" in content:
                print("Page is empty; stopping crawl.")
                break
            # Grab the title (first page only)
            if page == 1:
                title_pattern = r'<title>(.*?)</title>'
                title_match = re.search(title_pattern, content)
                if title_match:
                    title = title_match.group(1)
                    folder_name = clean_folder_name(title)
                    print(f"Page title: {title}")
                    print(f"Sanitized folder name: {folder_name}")
                else:
                    print("Could not extract the page title; using the default folder name.")
            # Extract image links
            image_links = extract_image_links(content)
            if image_links:
                print(f"Image links found on page {url}:")
                for link in image_links:
                    print(link)
                    prefix = str(n).zfill(3)      # zero-padded sequence number
                    suffix = link.split('.')[-1]  # file extension from the URL
                    img_name = f'{prefix}.{suffix}'
                    data[img_name] = link
                    n += 1
            else:
                print(f"No image links found on page {url}.")
    # Create the folder and save the data
    downloads_path = os.path.join(os.getcwd(), "downloads")
    if not os.path.exists(downloads_path):
        os.makedirs(downloads_path)
        print("Created the downloads folder.")
    folder_path = os.path.join(downloads_path, folder_name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Created folder: {folder_path}")
    data_file_path = os.path.join(folder_path, f"{key}.json")
    with open(data_file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {data_file_path}")
    return [folder_name, data_file_path]

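# Shape of the JSON written above (illustrative values):
# {
#     "001.jpg": "https://.../a.jpg",
#     "002.jpg": "https://.../b.jpg"
# }
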
def load_imgs_url_and_path():
    # Collect (path, url) pairs for images that have not been downloaded yet
    result = []
    downloads_path = os.path.join(os.getcwd(), "downloads")
    for root, dirs, files in os.walk(downloads_path):
        for file in files:
            if file.endswith(".json"):
                json_path = os.path.join(root, file)
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                for img_name, img_url in data.items():
                    img_path = os.path.join(root, img_name)
                    if not os.path.exists(img_path):
                        result.append([img_path, img_url])
    return result

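# Returns pairs like (illustrative):
# [["downloads/<folder>/001.jpg", "https://.../a.jpg"], ...]
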
def save_img(client, img_path, img_url, max_retries=999):
    retries = 0
    # Browser-like headers to reduce the chance of being blocked
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Priority": "u=0, i",
        "Sec-CH-UA": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        "Sec-CH-UA-Mobile": "?1",
        "Sec-CH-UA-Platform": '"Android"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Mobile Safari/537.36"
    }
    while retries < max_retries:
        try:
            # Download the image with the shared client and the headers above
            response = client.get(img_url, headers=headers, timeout=10)
            response.raise_for_status()  # raise on non-2xx responses
            # Save the image to the target path
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            with open(img_path, "wb") as f:
                f.write(response.content)
            print(f"Image downloaded and saved to {img_path}")
            time.sleep(random.uniform(1, 1.5))
            return  # done; leave the retry loop
        except httpx.HTTPStatusError as e:
            # Rotate the Clash node behind the local proxy port
            switch_to_random_proxy()
            if e.response.status_code == 429:
                # On 429, honor the Retry-After header (default 3 seconds)
                retry_after = int(e.response.headers.get('Retry-After', 3))
                print(f"Got HTTP 429; retrying in {retry_after} seconds...")
                time.sleep(retry_after)
                retries += 1
            else:
                print(f"Download failed: {os.path.basename(img_path)}, status code: {e.response.status_code}")
                break
        except Exception as e:
            print(f"Error while saving image: {e}")
            break
    if retries == max_retries:
        print(f"Image download failed after reaching max retries: {img_path}")

def switch_to_random_proxy(clash_api_url="http://127.0.0.1:9090", group_name="GLOBAL"):
    """
    Randomly switch to another node in a Clash proxy group,
    excluding the current node and DIRECT/REJECT.
    :param clash_api_url: Clash RESTful API address, default "http://127.0.0.1:9090"
    :param group_name: proxy group name, default "GLOBAL"
    """
    try:
        # Fetch all nodes in the proxy group
        response = httpx.get(f"{clash_api_url}/proxies")
        response.raise_for_status()
        proxies = response.json()
        if group_name not in proxies['proxies']:
            print(f"Proxy group '{group_name}' does not exist")
            return
        group_info = proxies['proxies'][group_name]
        if group_info['type'] != 'Selector':
            print(f"'{group_name}' is not a Selector proxy group")
            return
        # The node currently in use
        current_node = group_info['now']
        print(f"Current node: {current_node}")
        # All candidate nodes (excluding DIRECT and REJECT)
        nodes = [node for node in group_info['all'] if node not in ["DIRECT", "REJECT"]]
        if not nodes:
            print("No proxy nodes available")
            return
        # Pick a random node other than the current one
        available_nodes = [node for node in nodes if node != current_node]
        if not available_nodes:
            print("No other proxy nodes available")
            return
        random_node = random.choice(available_nodes)
        print(f"Switching to random node: {random_node}")
        # Switch the group to the chosen node
        switch_url = f"{clash_api_url}/proxies/{group_name}"
        response = httpx.put(switch_url, json={"name": random_node})
        if response.status_code == 204:
            print(f"Switched to node: {random_node}")
        else:
            print(f"Failed to switch node: {response.status_code}")
    except httpx.RequestError as e:  # httpx has no `exceptions` submodule
        print(f"Request failed: {e}")

def main():
    keys = check_and_load_keys()
    # Make sure the downloads folder exists before crawling
    downloads_path = os.path.join(os.getcwd(), "downloads")
    if not os.path.exists(downloads_path):
        os.makedirs(downloads_path)
        print("Created the downloads folder.")
    for key in keys:
        # Run the async crawler for this key
        result = asyncio.run(get_urls(key))
        if result:
            folder_name = result[0]
            data_file_path = result[1]
            print(f"Finished key. Folder: {folder_name}, data saved to: {data_file_path}")
    print('Fetched URL data for all keys; starting image downloads')
    time.sleep(0.1)
    all_data = load_imgs_url_and_path()
    # One shared httpx.Client instance for every download thread
    with httpx.Client(proxies=proxies) as client:
        # Download images concurrently, sized by the module-level max_workers setting
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for img_path, img_url in all_data:
                futures.append(executor.submit(save_img, client, img_path, img_url))
            # Wait for all threads to finish
            for future in concurrent.futures.as_completed(futures):
                future.result()  # re-raise any worker exception
    print("All images downloaded!")


if __name__ == "__main__":
    main()

keys.txt

@@ -0,0 +1 @@
Y0VRSUQ2NFgvdkVTNVNPOHJJUW9Idz09

random_proxy.py

@@ -0,0 +1,56 @@
import httpx
import random


def switch_to_random_proxy(clash_api_url="http://127.0.0.1:9090", group_name="GLOBAL"):
    """
    Randomly switch to another node in a Clash proxy group,
    excluding the current node and DIRECT/REJECT.
    :param clash_api_url: Clash RESTful API address, default "http://127.0.0.1:9090"
    :param group_name: proxy group name, default "GLOBAL"
    """
    try:
        # Fetch all nodes in the proxy group
        response = httpx.get(f"{clash_api_url}/proxies")
        response.raise_for_status()
        proxies = response.json()
        if group_name not in proxies['proxies']:
            print(f"Proxy group '{group_name}' does not exist")
            return
        group_info = proxies['proxies'][group_name]
        if group_info['type'] != 'Selector':
            print(f"'{group_name}' is not a Selector proxy group")
            return
        # The node currently in use
        current_node = group_info['now']
        print(f"Current node: {current_node}")
        # All candidate nodes (excluding DIRECT and REJECT)
        nodes = [node for node in group_info['all'] if node not in ["DIRECT", "REJECT"]]
        if not nodes:
            print("No proxy nodes available")
            return
        # Pick a random node other than the current one
        available_nodes = [node for node in nodes if node != current_node]
        if not available_nodes:
            print("No other proxy nodes available")
            return
        random_node = random.choice(available_nodes)
        print(f"Switching to random node: {random_node}")
        # Switch the group to the chosen node
        switch_url = f"{clash_api_url}/proxies/{group_name}"
        response = httpx.put(switch_url, json={"name": random_node})
        if response.status_code == 204:
            print(f"Switched to node: {random_node}")
        else:
            print(f"Failed to switch node: {response.status_code}")
    except httpx.RequestError as e:  # httpx has no `exceptions` submodule
        print(f"Request failed: {e}")


if __name__ == "__main__":
    switch_to_random_proxy()