You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

229 lines
8.8 KiB

import requests
import xml.etree.ElementTree as ET
import os
import sys
import argparse
def search_arxiv(query, max_results=10, timeout=30):
    """Query the arXiv export API and return the raw response body.

    Args:
        query: Search expression sent as the ``search_query`` parameter.
        max_results: Maximum number of entries to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text. The HTTP status code is not inspected
        here; callers receive whatever the server sent.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    base_url = "http://export.arxiv.org/api/query"
    params = {
        'search_query': query,
        'start': 0,
        'max_results': max_results,
    }
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would freeze the CLI.
    response = requests.get(base_url, params=params, timeout=timeout)
    return response.text
def get_paper_metadata(paper_id):
    """Look up a single paper's metadata through the arXiv API.

    Args:
        paper_id: arXiv identifier, sent as the ``id_list`` parameter.

    Returns:
        The first entry produced by ``parse_search_results`` for the
        response, or ``None`` when the request fails, the HTTP status is
        not 200, or no entries were parsed.
    """
    metadata_url = "http://export.arxiv.org/api/query"
    try:
        # Pass the ID through ``params`` so it is URL-encoded; user-typed IDs
        # may contain characters that are not safe to paste into a URL.
        response = requests.get(
            metadata_url,
            params={'id_list': paper_id},
            timeout=30,
        )
        if response.status_code != 200:
            return None
        papers = parse_search_results(response.text)
        return papers[0] if papers else None
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup, so any failure
        # is reported and turned into a "not found" result.
        print(f"Error fetching paper metadata: {e}")
        return None
def _pdf_filename(paper_id, paper_title=None):
    """Build a filesystem-safe ``.pdf`` filename from the title or the ID."""
    stem = ""
    if paper_title:
        # Keep letters, digits, whitespace, '-' and '_'; everything else is
        # dropped. Whitespace (including newlines from wrapped titles) is
        # kept here so it can act as a word separator below.
        kept = "".join(
            c for c in paper_title
            if c.isalnum() or c.isspace() or c in ('-', '_')
        )
        # Collapse whitespace runs to a single underscore and cap the length.
        stem = "_".join(kept.split())[:100].rstrip('_')
    if not stem:
        # No usable title: fall back to the ID. IDs may contain a path
        # separator (e.g. "hep-th/9901001"), which must not end up in the
        # filename.
        stem = paper_id.replace('/', '_').replace('\\', '_')
    return f"{stem}.pdf"


def download_paper(paper_id, output_dir=".", paper_title=None):
    """Download a paper's PDF from arXiv and save it to ``output_dir``.

    Args:
        paper_id: arXiv identifier used to build the PDF URL.
        output_dir: Directory to write into; created if it does not exist.
        paper_title: Optional title used to derive the filename. When it is
            missing or contains no usable characters, the paper ID is used.

    Returns:
        The path of the written file, or ``None`` if the server did not
        answer with HTTP 200.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
        OSError: If the file cannot be written.
    """
    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    # A timeout prevents the tool from hanging forever on a stalled transfer.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        print(f"Failed to download paper {paper_id}")
        return None

    # An empty output_dir means "current directory"; makedirs("") would fail.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, _pdf_filename(paper_id, paper_title))
    with open(filepath, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {filepath}")
    return filepath
def _element_text(elem, collapse=False):
    """Return an element's stripped text, or None if it is missing or empty."""
    if elem is None or elem.text is None:
        return None
    if collapse:
        # Join on single spaces so line-wrapped text becomes one line.
        text = ' '.join(elem.text.split())
    else:
        text = elem.text.strip()
    return text or None


def parse_search_results(xml_content):
    """Parse an Atom feed and extract one dict per ``entry`` element.

    Args:
        xml_content: The feed as an XML string.

    Returns:
        A list of dicts. Each dict always has ``authors`` (a list of names,
        possibly empty) and may have ``title``, ``abstract``, ``paper_id``
        and ``published`` when the corresponding element is present and
        non-empty. Returns an empty list if the XML cannot be parsed.
    """
    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return []

    papers = []
    for entry in root.findall('.//atom:entry', ns):
        paper_info = {}

        # Titles may be wrapped across lines in the feed; collapse them so
        # they display and serve as filenames cleanly.
        title = _element_text(entry.find('.//atom:title', ns), collapse=True)
        if title is not None:
            paper_info['title'] = title

        authors = []
        for author in entry.findall('.//atom:author', ns):
            name = _element_text(author.find('.//atom:name', ns))
            if name is not None:
                authors.append(name)
        paper_info['authors'] = authors

        abstract = _element_text(entry.find('.//atom:summary', ns))
        if abstract is not None:
            paper_info['abstract'] = abstract

        id_url = _element_text(entry.find('.//atom:id', ns))
        if id_url is not None:
            # The id is a URL like "http://arxiv.org/abs/2103.12345". Take
            # everything after "/abs/" so IDs that themselves contain a
            # slash ("hep-th/9901001") keep their category prefix; fall
            # back to the last path segment for other URL shapes.
            _, marker, tail = id_url.partition('/abs/')
            paper_info['paper_id'] = tail if marker else id_url.split('/')[-1]

        published = _element_text(entry.find('.//atom:published', ns))
        if published is not None:
            paper_info['published'] = published

        papers.append(paper_info)
    return papers
def search_and_download(query, max_results=5, download_first=False):
    """Search arXiv, print the results, and optionally download the first one.

    Args:
        query: Search expression passed to ``search_arxiv``.
        max_results: Maximum number of results to request.
        download_first: When true, download the first result's PDF, named
            after its title.

    Returns:
        None. All output is printed.
    """
    print(f"Searching arXiv for: '{query}'")
    print("-" * 50)

    papers = parse_search_results(search_arxiv(query, max_results))
    if not papers:
        print("No papers found.")
        return

    print(f"Found {len(papers)} papers:\n")
    for i, paper in enumerate(papers, 1):
        # An empty author list should read "N/A", not a blank field.
        authors = paper.get('authors') or ['N/A']
        abstract = paper.get('abstract', 'N/A')
        # Only mark the abstract as truncated when it actually was.
        if len(abstract) > 200:
            abstract = abstract[:200] + "..."
        print(f"{i}. Title: {paper.get('title', 'N/A')}")
        print(f" Authors: {', '.join(authors)}")
        print(f" Paper ID: {paper.get('paper_id', 'N/A')}")
        print(f" Published: {paper.get('published', 'N/A')}")
        print(f" Abstract: {abstract}")
        print()

    if not download_first:
        return

    first_paper = papers[0]
    paper_id = first_paper.get('paper_id')
    if paper_id:
        print(f"Downloading first paper: {paper_id}")
        download_paper(paper_id, paper_title=first_paper.get('title'))
    else:
        print("Could not extract paper ID for download")
def _print_interactive_commands():
    """Print the list of commands understood by the interactive prompt."""
    print("Commands:")
    print(" search <query> [max_results] - Search for papers")
    print(" download <paper_id> - Download a specific paper")
    print(" help - Show this help message")
    print(" quit/exit - Exit the program")
    print()


def _split_search_args(args, default_max=5):
    """Split the words after ``search`` into ``(query, max_results)``.

    A trailing all-digit word is taken as ``max_results`` only when at
    least one query word precedes it; otherwise every word belongs to the
    query and ``default_max`` is used.
    """
    if len(args) > 1 and args[-1].isdecimal():
        return ' '.join(args[:-1]), int(args[-1])
    return ' '.join(args), default_max


def interactive_mode():
    """Run a read-eval-print loop for searching and downloading papers.

    Reads commands from standard input until the user enters
    ``quit``/``exit``/``q``, presses Ctrl-C, or input reaches end-of-file.
    Errors raised while handling a command are printed and the loop
    continues.
    """
    print("🔍 arXiv Paper Search Tool")
    print("=" * 40)
    _print_interactive_commands()

    while True:
        try:
            command = input("📚 arxiv> ").strip()
            if not command:
                continue
            cmd, *args = command.split()
            cmd = cmd.lower()

            if cmd in ('quit', 'exit', 'q'):
                print("Goodbye! 👋")
                break
            elif cmd == 'help':
                _print_interactive_commands()
            elif cmd == 'search':
                if not args:
                    print("Usage: search <query> [max_results]")
                    continue
                # Keep every query word; only a trailing number is treated
                # as the result limit.
                query, max_results = _split_search_args(args)
                search_and_download(query, max_results, download_first=False)
            elif cmd == 'download':
                if not args:
                    print("Usage: download <paper_id>")
                    continue
                paper_id = args[0]
                # Fetch metadata first so the file can be named by title.
                print(f"Fetching paper information for {paper_id}...")
                paper_info = get_paper_metadata(paper_id)
                if paper_info and paper_info.get('title'):
                    paper_title = paper_info['title']
                    print(f"Found paper: {paper_title}")
                    download_paper(paper_id, paper_title=paper_title)
                else:
                    print(f"Could not find paper information for {paper_id}")
                    print("Downloading with paper ID as filename...")
                    download_paper(paper_id)
            else:
                print(f"Unknown command: {cmd}")
                print("Type 'help' for available commands")
        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop: once stdin is closed, input()
            # raises it on every call, so treating it as a generic error
            # would spin forever.
            print("\nGoodbye! 👋")
            break
        except Exception as e:
            # Top-level boundary of the REPL: report and keep prompting.
            print(f"Error: {e}")
# Command-line entry point
def main(argv=None):
    """Parse command-line arguments and run a one-shot search or the REPL.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    A query given without ``-i`` runs a single search (downloading the
    first result when ``-d`` is set). In every other case the interactive
    prompt is started.
    """
    parser = argparse.ArgumentParser(description='Search and download papers from arXiv')
    parser.add_argument('query', nargs='?', help='Search query')
    parser.add_argument('-n', '--max_results', type=int, default=5, help='Maximum number of results (default: 5)')
    parser.add_argument('-d', '--download', action='store_true', help='Download the first result')
    parser.add_argument('-i', '--interactive', action='store_true', help='Start interactive mode')
    args = parser.parse_args(argv)

    # -i takes precedence over a query; with neither, interactive mode is
    # the default.
    if args.query and not args.interactive:
        search_and_download(args.query, args.max_results, args.download)
    else:
        interactive_mode()


if __name__ == "__main__":
    main()