You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
229 lines
8.8 KiB
229 lines
8.8 KiB
import requests
|
|
import xml.etree.ElementTree as ET
|
|
import os
|
|
import sys
|
|
import argparse
|
|
|
|
def search_arxiv(query, max_results=10, timeout=30):
    """Search arXiv for papers matching *query*.

    Args:
        query: arXiv API search expression (e.g. "all:electron").
        max_results: maximum number of entries to request.
        timeout: seconds to wait for the HTTP response (new, defaulted
            parameter — callers using the old signature are unaffected).

    Returns:
        The raw Atom XML response body as a string.
    """
    base_url = "http://export.arxiv.org/api/query"
    params = {
        'search_query': query,
        'start': 0,
        'max_results': max_results
    }

    # A timeout prevents the script from hanging indefinitely when the
    # arXiv API is slow or unreachable (requests has no default timeout).
    response = requests.get(base_url, params=params, timeout=timeout)
    return response.text
|
|
def get_paper_metadata(paper_id):
    """Get paper metadata directly from the arXiv API.

    Args:
        paper_id: arXiv identifier such as "2103.12345" or "2103.12345v2".

    Returns:
        A dict of parsed fields for the first matching entry (see
        parse_search_results), or None if the lookup fails.
    """
    try:
        # Use the arXiv API to get paper metadata for this specific id.
        metadata_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"
        # Timeout so a stalled connection cannot hang the caller forever.
        response = requests.get(metadata_url, timeout=30)

        if response.status_code == 200:
            papers = parse_search_results(response.text)
            if papers:  # truthiness suffices; len(papers) > 0 was redundant
                return papers[0]
        return None
    except Exception as e:
        # Deliberate best-effort: report the failure and return None so the
        # caller can still download by bare id without a title.
        print(f"Error fetching paper metadata: {e}")
        return None
|
|
def download_paper(paper_id, output_dir=".", paper_title=None):
    """Download a paper's PDF by arXiv id and save it locally.

    Args:
        paper_id: arXiv identifier, e.g. "2103.12345".
        output_dir: directory the PDF is written into.
        paper_title: optional human-readable title used for the filename;
            falls back to the paper id when absent or unusable.

    Returns:
        The path of the written file, or None if the download failed.
    """
    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    # Timeout so a dead connection cannot hang the program indefinitely.
    response = requests.get(pdf_url, timeout=60)

    if response.status_code == 200:
        # Create filename from paper title if available, otherwise use paper ID
        filename = None
        if paper_title:
            # Clean the title for filename (remove special characters, limit length)
            clean_title = "".join(c for c in paper_title if c.isalnum() or c in (' ', '-', '_')).rstrip()
            clean_title = clean_title.replace(' ', '_')[:100]  # Limit length to 100 chars
            # A title made entirely of stripped characters cleans down to "";
            # without this guard the file would be the hidden name ".pdf".
            if clean_title:
                filename = f"{clean_title}.pdf"
        if filename is None:
            filename = f"{paper_id}.pdf"

        filepath = os.path.join(output_dir, filename)

        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filepath}")
        return filepath
    else:
        print(f"Failed to download paper {paper_id}")
        return None
|
def parse_search_results(xml_content):
    """Parse an arXiv Atom feed and extract per-paper information.

    Args:
        xml_content: raw XML string returned by the arXiv API.

    Returns:
        A list of dicts, one per feed entry, with whichever of the keys
        'title', 'authors', 'abstract', 'paper_id', 'published' were
        present; an empty list if the XML cannot be parsed.
    """
    # Atom namespace used by the arXiv feed; hoisted so it is written once
    # instead of repeated in every find/findall expression.
    ns = '{http://www.w3.org/2005/Atom}'
    try:
        root = ET.fromstring(xml_content)
        papers = []

        # Find all entry elements
        for entry in root.findall(f'.//{ns}entry'):
            paper_info = {}

            # Extract title. Guard .text: an empty <title/> has text=None and
            # the original's unconditional .strip() raised AttributeError,
            # which escaped the except (only ParseError was caught).
            title_elem = entry.find(f'.//{ns}title')
            if title_elem is not None and title_elem.text:
                paper_info['title'] = title_elem.text.strip()

            # Extract authors
            authors = []
            for author in entry.findall(f'.//{ns}author'):
                name_elem = author.find(f'.//{ns}name')
                if name_elem is not None and name_elem.text:
                    authors.append(name_elem.text.strip())
            paper_info['authors'] = authors

            # Extract abstract
            summary_elem = entry.find(f'.//{ns}summary')
            if summary_elem is not None and summary_elem.text:
                paper_info['abstract'] = summary_elem.text.strip()

            # Extract paper ID from the id field
            id_elem = entry.find(f'.//{ns}id')
            if id_elem is not None and id_elem.text:
                # Extract ID from URL like "http://arxiv.org/abs/2103.12345"
                paper_info['paper_id'] = id_elem.text.split('/')[-1]

            # Extract published date
            published_elem = entry.find(f'.//{ns}published')
            if published_elem is not None and published_elem.text:
                paper_info['published'] = published_elem.text.strip()

            papers.append(paper_info)

        return papers
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return []
|
|
def search_and_download(query, max_results=5, download_first=False):
    """Search for papers, print the results, and optionally download one.

    Args:
        query: arXiv search expression.
        max_results: how many results to request and display.
        download_first: when True, download the PDF of the first hit.
    """
    print(f"Searching arXiv for: '{query}'")
    print("-" * 50)

    # Search for papers
    results = search_arxiv(query, max_results)
    papers = parse_search_results(results)

    if not papers:
        print("No papers found.")
        return

    # Display search results
    print(f"Found {len(papers)} papers:\n")
    for i, paper in enumerate(papers, 1):
        print(f"{i}. Title: {paper.get('title', 'N/A')}")
        print(f" Authors: {', '.join(paper.get('authors', ['N/A']))}")
        print(f" Paper ID: {paper.get('paper_id', 'N/A')}")
        print(f" Published: {paper.get('published', 'N/A')}")
        print(f" Abstract: {paper.get('abstract', 'N/A')[:200]}...")
        print()

    # Optionally download first paper. 'papers' is guaranteed non-empty here
    # (we returned early above), so the original 'and papers' was dead code.
    if download_first:
        first_paper = papers[0]
        paper_id = first_paper.get('paper_id')
        paper_title = first_paper.get('title')
        if paper_id:
            print(f"Downloading first paper: {paper_id}")
            download_paper(paper_id, paper_title=paper_title)
        else:
            print("Could not extract paper ID for download")
|
def _print_help():
    """Print the interactive command reference (shared by startup and 'help')."""
    print("Commands:")
    print(" search <query> [max_results] - Search for papers")
    print(" download <paper_id> - Download a specific paper")
    print(" help - Show this help message")
    print(" quit/exit - Exit the program")
    print()


def interactive_mode():
    """Interactive REPL for searching and downloading arXiv papers."""
    print("🔍 arXiv Paper Search Tool")
    print("=" * 40)
    _print_help()

    while True:
        try:
            command = input("📚 arxiv> ").strip()

            if not command:
                continue

            parts = command.split()
            cmd = parts[0].lower()

            if cmd in ['quit', 'exit', 'q']:
                print("Goodbye! 👋")
                break

            elif cmd == 'help':
                _print_help()

            elif cmd == 'search':
                if len(parts) < 2:
                    print("Usage: search <query> [max_results]")
                    continue

                # BUG FIX: the original used
                #   query = ' '.join(parts[1:-1]) if len(parts) > 2 else parts[1]
                # which silently dropped the final word of any multi-word
                # query whose last token was not a number ("search deep
                # learning" searched only "deep"). Decide first whether the
                # last token really is a max_results count.
                if len(parts) > 2 and parts[-1].isdigit():
                    query = ' '.join(parts[1:-1])
                    max_results = int(parts[-1])
                else:
                    query = ' '.join(parts[1:])
                    max_results = 5

                search_and_download(query, max_results, download_first=False)

            elif cmd == 'download':
                if len(parts) < 2:
                    print("Usage: download <paper_id>")
                    continue

                paper_id = parts[1]
                # Get paper metadata first so the file can be named by title
                print(f"Fetching paper information for {paper_id}...")
                paper_info = get_paper_metadata(paper_id)

                if paper_info and paper_info.get('title'):
                    paper_title = paper_info['title']
                    print(f"Found paper: {paper_title}")
                    download_paper(paper_id, paper_title=paper_title)
                else:
                    print(f"Could not find paper information for {paper_id}")
                    print("Downloading with paper ID as filename...")
                    download_paper(paper_id)

            else:
                print(f"Unknown command: {cmd}")
                print("Type 'help' for available commands")

        except KeyboardInterrupt:
            print("\nGoodbye! 👋")
            break
        except Exception as e:
            # Top-level REPL boundary: report and keep the loop alive.
            print(f"Error: {e}")
|
# Command-line entry point
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description='Search and download papers from arXiv')
    cli.add_argument('query', nargs='?', help='Search query')
    cli.add_argument('-n', '--max_results', type=int, default=5, help='Maximum number of results (default: 5)')
    cli.add_argument('-d', '--download', action='store_true', help='Download the first result')
    cli.add_argument('-i', '--interactive', action='store_true', help='Start interactive mode')

    opts = cli.parse_args()

    # -i forces interactive mode; a missing query also falls back to it,
    # so a bare invocation drops straight into the REPL.
    if opts.interactive or not opts.query:
        interactive_mode()
    else:
        search_and_download(opts.query, opts.max_results, opts.download)