"""
|
|
Paper Analysis Blueprint - Flask Blueprint for analyzing research papers using Deepseek AI
|
|
"""
|
|
|
|
import json
import os
import tempfile

import requests
from flask import Blueprint, render_template, request, jsonify
from werkzeug.utils import secure_filename
# Create blueprint
# All routes below are served under the /paper-analysis URL prefix.
paper_analysis_bp = Blueprint('paper_analysis', __name__, url_prefix='/paper-analysis')
@paper_analysis_bp.route('/')
def paper_analysis():
    """Render the paper analysis landing page."""
    template_name = 'paper_analysis.html'
    return render_template(template_name)
@paper_analysis_bp.route('/api/test-deepseek', methods=['POST'])
def test_deepseek():
    """Probe the Deepseek chat endpoint with the caller's API key.

    The key is read from the ``X-API-Key`` header; a tiny one-message
    completion request is issued purely as a connectivity check.
    """
    try:
        api_key = request.headers.get('X-API-Key')
        if not api_key:
            return jsonify({'error': 'API key is required'}), 401

        request_headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }
        # Minimal probe payload: one short message, tiny token budget.
        probe_payload = {
            'model': 'deepseek-chat',
            'messages': [
                {'role': 'user', 'content': 'Say hello'}
            ],
            'max_tokens': 10
        }

        test_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',  # Using chat completions endpoint
            headers=request_headers,
            json=probe_payload,
            timeout=10
        )

        if not test_response.ok:
            return jsonify({
                'success': False,
                'error': f'API Error: {test_response.status_code}',
                'details': test_response.text
            }), test_response.status_code

        return jsonify({
            'success': True,
            'message': 'Deepseek API connection successful',
            'response': test_response.json()
        })

    except requests.exceptions.RequestException as e:
        return jsonify({
            'success': False,
            'error': 'Connection error',
            'details': str(e)
        }), 500
    except Exception as e:
        return jsonify({
            'success': False,
            'error': 'Unexpected error',
            'details': str(e)
        }), 500
@paper_analysis_bp.route('/api/analyze-paper', methods=['POST'])
def analyze_paper():
    """Analyze an uploaded paper with the Deepseek API.

    Reads the API key from the ``X-API-Key`` header, boolean analysis
    toggles from the form fields, and the document from the ``file`` part.
    Returns JSON with ``keywords``, ``summary`` and ``related_works``
    (which actually holds extracted formulas) entries.
    """
    try:
        # Get API key from header
        api_key = request.headers.get('X-API-Key')
        if not api_key:
            return jsonify({'error': 'API key is required'}), 401

        # Analysis options arrive as form-encoded string flags.
        extract_keywords = request.form.get('extract_keywords') == 'true'
        generate_summary = request.form.get('generate_summary') == 'true'
        find_related = request.form.get('find_related') == 'true'

        # Get uploaded file
        if 'file' not in request.files:
            return jsonify({'error': 'No file uploaded'}), 400

        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        # Determine upload size without reading it into memory (limit 50MB).
        file.seek(0, 2)  # Seek to end
        file_size = file.tell()
        file.seek(0)  # Reset to beginning

        if file_size > 50 * 1024 * 1024:  # 50MB limit
            return jsonify({'error': 'File too large. Maximum size is 50MB'}), 400

        if file_size == 0:
            return jsonify({'error': 'File is empty'}), 400

        # Save file temporarily so the extractors can work from a path.
        filename = secure_filename(file.filename)
        # BUG FIX: previously printed a literal "(unknown)" instead of the
        # sanitized upload name.
        print(f"Processing file: {filename} (size: {file_size} bytes)")

        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
            file.save(temp_file.name)
            file_path = temp_file.name

        try:
            # Initialize results dictionary
            results = {
                'keywords': [],
                'summary': '',
                'related_works': []
            }

            # Extract text from file
            text = extract_text_from_file(file_path, filename)

            if not text or not text.strip():
                return jsonify({'error': 'Could not extract text from the file. The file might be empty or in an unsupported format.'}), 400

            # Clean up text
            text = text.strip()
            print(f"Final text length before truncation: {len(text)}")

            # Reject extractions too short to analyze (likely a scanned PDF).
            if len(text) < 100:
                return jsonify({
                    'error': 'Extracted text is too short. This might be a scanned PDF without OCR text. Please ensure your PDF contains selectable text, not just images.'
                }), 400

            # Trim/prioritize content so it fits the model context window.
            text = process_large_document(text)

            # Shared auth headers for every Deepseek call below.
            headers = {
                'Authorization': f'Bearer {api_key}',
                'Content-Type': 'application/json'
            }

            if extract_keywords:
                results['keywords'] = extract_keywords_with_deepseek(text, headers)

            if generate_summary:
                results['summary'] = generate_summary_with_deepseek(text, headers)

            if find_related:
                # NOTE: the "find related" option extracts formulas; the
                # response key is kept as 'related_works' for API stability.
                results['related_works'] = extract_formulas_with_deepseek(text, headers)

            return jsonify(results)

        finally:
            # Clean up temporary file even on early returns; deletion failure
            # is logged but deliberately non-fatal.
            try:
                os.unlink(file_path)
            except Exception as e:
                print(f"Error deleting temporary file: {str(e)}")

    except Exception as e:
        print(f"Analyze paper error: {str(e)}")
        return jsonify({'error': str(e)}), 500
def extract_text_from_file(file_path, filename):
    """Dispatch to the extractor matching *filename*'s extension.

    Unknown extensions fall back to plain-text extraction.  Any extractor
    failure is logged and re-raised as ``Exception('Error reading file: ...')``.
    """
    extension = os.path.splitext(filename)[1].lower()
    extracted = ''

    try:
        if extension == '.pdf':
            extracted = extract_pdf_text(file_path)
        elif extension in ('.docx', '.doc'):
            extracted = extract_word_text(file_path, extension)
        elif extension == '.rtf':
            extracted = extract_rtf_text(file_path)
        elif extension in ('.tex', '.latex'):
            extracted = extract_latex_text(file_path)
        elif extension in ('.md', '.markdown'):
            extracted = extract_markdown_text(file_path)
        else:
            extracted = extract_plain_text(file_path)
    except Exception as e:
        print(f"File processing error: {str(e)}")
        raise Exception(f"Error reading file: {str(e)}")

    return extracted
def extract_pdf_text(file_path):
    """Extract text from a PDF file.

    Backend order: PyPDF2 first; pdfplumber if PyPDF2 is not installed;
    PyMuPDF (fitz) if PyPDF2 is installed but fails on this document.

    Raises:
        Exception: with a human-readable message when no backend is
            available or every backend fails.
    """
    try:
        from PyPDF2 import PdfReader
        reader = PdfReader(file_path)
        text = ''
        num_pages = len(reader.pages)
        print(f"PDF has {num_pages} pages")

        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + '\n'
                print(f"Extracted page {i+1}/{num_pages}")
            except Exception as page_error:
                # One bad page should not abort the whole document.
                print(f"Error extracting page {i+1}: {str(page_error)}")
                continue

        print(f"Total extracted text length: {len(text)}")
        return text

    except ImportError:
        # PyPDF2 missing: try pdfplumber instead.
        try:
            import pdfplumber
            text = ''
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + '\n'
            return text
        except ImportError:
            raise Exception('PDF processing is not available. Please install PyPDF2 or pdfplumber.')
    except Exception as pdf_error:
        print(f"PDF extraction error: {str(pdf_error)}")
        # PyPDF2 choked on this document: try PyMuPDF as a last resort.
        try:
            import fitz  # PyMuPDF
            pdf_document = fitz.open(file_path)
            text = ''
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                text += page.get_text() + '\n'
            pdf_document.close()
            return text
        except ImportError:
            raise Exception(f'Could not extract text from PDF: {str(pdf_error)}. Try installing PyMuPDF.')
        except Exception as mupdf_error:
            # BUG FIX: the fallback's own error was captured but discarded;
            # report both causes so the failure is diagnosable.
            raise Exception(f'PDF extraction failed: {str(pdf_error)}; PyMuPDF fallback also failed: {str(mupdf_error)}')
def extract_word_text(file_path, file_ext):
    """Extract text from a Word document (.docx via python-docx, .doc via docx2txt).

    Args:
        file_path: Path to the document on disk.
        file_ext: Lower-cased extension including the dot ('.docx' or '.doc').

    Raises:
        Exception: install-guidance message when the needed library is
            missing, or 'Error reading Word document: ...' on read failure.
    """
    # BUG FIX: the original raised the docx2txt install-guidance message
    # inside a nested try, so the outer `except Exception` re-wrapped it as
    # 'Error reading Word document: DOC file support requires...'.  Import
    # checks are now separated from the read so guidance propagates intact.
    if file_ext == '.docx':
        try:
            from docx import Document
        except ImportError:
            raise Exception('Word document support requires python-docx. Please install it with: pip install python-docx')
        try:
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as docx_error:
            raise Exception(f'Error reading Word document: {str(docx_error)}')
    else:
        # .doc files
        try:
            import docx2txt
        except ImportError:
            raise Exception('DOC file support requires docx2txt. Please install it with: pip install docx2txt')
        try:
            return docx2txt.process(file_path)
        except Exception as doc_error:
            raise Exception(f'Error reading Word document: {str(doc_error)}')
def extract_rtf_text(file_path):
    """Extract plain text from an RTF file using the striprtf package.

    Raises:
        Exception: install guidance if striprtf is missing, or
            'Error reading RTF file: ...' on any read/parse failure.
    """
    try:
        # BUG FIX: `rtf_to_text` lives in the striprtf.striprtf module; the
        # top-level package does not expose it, so the previous
        # `import striprtf; striprtf.rtf_to_text(...)` always raised
        # AttributeError and every RTF upload failed.
        from striprtf.striprtf import rtf_to_text
        with open(file_path, 'r', encoding='utf-8') as f:
            rtf_content = f.read()
        return rtf_to_text(rtf_content)
    except ImportError:
        raise Exception('RTF support requires striprtf. Please install it with: pip install striprtf')
    except Exception as rtf_error:
        raise Exception(f'Error reading RTF file: {str(rtf_error)}')
def extract_latex_text(file_path):
    """Read a LaTeX source file and strip markup down to readable text."""
    try:
        import re

        with open(file_path, 'r', encoding='utf-8') as source:
            stripped = source.read()

        # Drop % comments to end of line.
        stripped = re.sub(r'%.*$', '', stripped, flags=re.MULTILINE)
        # Remove environment delimiters (\begin{...} / \end{...}).
        stripped = re.sub(r'\\(begin|end)\{[^}]+\}', '', stripped)
        # Unwrap single-argument commands, keeping their content.
        stripped = re.sub(r'\\[a-zA-Z]+\*?\{([^}]+)\}', r'\1', stripped)
        # Remove any remaining bare commands.
        stripped = re.sub(r'\\[a-zA-Z]+\*?', '', stripped)
        return stripped
    except Exception as tex_error:
        raise Exception(f'Error reading LaTeX file: {str(tex_error)}')
def extract_markdown_text(file_path):
    """Read a Markdown file, dropping images and reducing links to their text."""
    try:
        import re

        with open(file_path, 'r', encoding='utf-8') as source:
            content = source.read()

        # Remove embedded images entirely: ![alt](url)
        content = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', content)
        # Keep only the visible text of links: [text](url) -> text
        content = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', content)
        return content
    except Exception as md_error:
        raise Exception(f'Error reading Markdown file: {str(md_error)}')
def extract_plain_text(file_path):
    """Read a text file, trying several encodings before a lossy fallback.

    Encodings are attempted in order; the first full successful read wins.
    (latin-1 accepts any byte sequence, so the binary fallback is rare.)
    """
    candidate_encodings = ('utf-8', 'utf-16', 'gbk', 'gb2312', 'big5', 'latin-1')
    content = None

    for candidate in candidate_encodings:
        try:
            with open(file_path, 'r', encoding=candidate) as handle:
                content = handle.read()
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"Error reading with {candidate}: {str(e)}")
            continue
        print(f"Successfully read file with {candidate} encoding")
        break

    if content is None:
        # Try reading as binary and decode leniently.
        with open(file_path, 'rb') as handle:
            raw = handle.read()
        try:
            content = raw.decode('utf-8', errors='ignore')
        except:
            content = str(raw)

    return content
def process_large_document(text):
    """Shrink very long documents (>98k chars), keeping math-heavy sections.

    Short texts are returned unchanged.  For long texts, lines containing
    mathematical indicators (operators, Greek letters, or words such as
    "equation") are collected with five lines of context on each side and
    prioritized in the truncated output.
    """
    limit = 98000
    if len(text) <= limit:
        return text

    print("Large document detected, prioritizing content for formula extraction")
    import re

    # Indicator pattern for mathematical content.
    math_pattern = r'[=+\-*/∫∑∏√∂∇∆λμσπ]|equation|formula|theorem|lemma|proof'
    all_lines = text.split('\n')
    math_sections = []
    for idx, line in enumerate(all_lines):
        if re.search(math_pattern, line, re.IGNORECASE):
            # Keep surrounding context; overlapping windows intentionally
            # duplicate lines, matching the prior behavior.
            lo = max(0, idx - 5)
            hi = min(len(all_lines), idx + 6)
            math_sections.extend(all_lines[lo:hi])

    if not math_sections:
        # No math indicators found: keep the head of the document.
        return text[:limit]

    math_text = '\n'.join(math_sections)
    if len(math_text) > 50000:  # Still too long
        return math_text[:limit]

    # Combine the document head with the collected math sections.
    remaining_space = limit - len(math_text)
    return text[:remaining_space] + '\n\n[Mathematical content sections:]\n' + math_text
def extract_keywords_with_deepseek(text, headers):
    """Ask the Deepseek chat API for weighted keywords.

    Returns a list of {"text": ..., "score": ...} dicts, or an empty list on
    any API, network, or parsing failure (errors are logged, not raised).
    """
    user_prompt = f"""Analyze the following academic paper and extract the key technical terms and concepts.
For each keyword, provide a relevance score between 0 and 1.
Return ONLY a valid JSON array of objects with 'text' and 'score' properties.
Example format: [{{"text": "machine learning", "score": 0.95}}, {{"text": "neural networks", "score": 0.85}}]

Paper text:
{text}"""

    payload = {
        'model': 'deepseek-chat',
        'messages': [
            {
                'role': 'system',
                'content': 'You are a helpful assistant that extracts keywords from academic papers. Always respond with valid JSON.'
            },
            {
                'role': 'user',
                'content': user_prompt
            }
        ],
        'temperature': 0.3,
        'max_tokens': 4000
    }

    try:
        keyword_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60
        )

        if not keyword_response.ok:
            print(f"Keywords API error: {keyword_response.text}")
            return []

        response_content = keyword_response.json()['choices'][0]['message']['content']
        try:
            import re
            # The model may wrap the JSON in prose; isolate the array first.
            json_match = re.search(r'\[.*\]', response_content, re.DOTALL)
            if json_match:
                return json.loads(json_match.group())
            return json.loads(response_content)
        except json.JSONDecodeError:
            print(f"Invalid JSON from keywords API: {response_content}")
            return []

    except Exception as e:
        print(f"Error in keywords extraction: {str(e)}")
        return []
def generate_summary_with_deepseek(text, headers):
    """Request a structured summary of the paper from the Deepseek chat API.

    Returns the summary text, or the literal string "Error generating summary"
    when the request or the API fails (errors are logged to stdout).
    """
    user_prompt = f"""Provide a comprehensive summary of the following academic paper.
Focus on the main contributions, methodology, and key findings.
Keep the response concise and well-structured.

Paper text:
{text}"""

    payload = {
        'model': 'deepseek-chat',
        'messages': [
            {
                'role': 'system',
                'content': 'You are a helpful assistant that summarizes academic papers.'
            },
            {
                'role': 'user',
                'content': user_prompt
            }
        ],
        'temperature': 0.3,
        'max_tokens': 4000
    }

    try:
        summary_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60
        )

        if summary_response.ok:
            return summary_response.json()['choices'][0]['message']['content']

        print(f"Summary API error: {summary_response.text}")
        return "Error generating summary"

    except Exception as e:
        print(f"Error in summary generation: {str(e)}")
        return "Error generating summary"
def extract_formulas_with_deepseek(text, headers):
    """Extract mathematical formulas from paper text using the Deepseek API.

    Despite the ``related_*`` variable names (a leftover from a "related
    works" feature — this function now backs the "find related" option),
    the prompt asks the model to enumerate every formula in *text* and
    return a JSON array of dicts with formula, description, variables,
    type, context and Chinese-translation fields.

    Args:
        text: Extracted (and possibly truncated) paper text.
        headers: HTTP headers including the Deepseek Authorization token.

    Returns:
        list: Formula dicts parsed from the model response, or [] on any
        API, network, or JSON-parsing failure (errors are logged).
    """
    try:
        # NOTE(review): the "related_*" naming is historical; the payload
        # below is purely a formula-extraction prompt.
        related_messages = [
            {
                'role': 'system',
                'content': '''You are an expert mathematician and AI assistant specialized in extracting mathematical formulas from academic papers.
Your task is to identify and extract ALL mathematical formulas, equations, and mathematical expressions from the given text, try as much as you can.

IMPORTANT INSTRUCTIONS:
1. Extract EVERY mathematical formula, equation, or expression you find
2. Include inline formulas, displayed equations, and mathematical definitions
3. Preserve the original notation as much as possible
4. For each formula, provide context about what it represents
5. Always respond with valid JSON format

You must be thorough and extract ALL formulas, not just the main ones.'''
            },
            {
                'role': 'user',
                'content': f"""Extract ALL mathematical formulas and equations from the following paper text.

For each formula found, provide:
- The formula itself (in LaTeX notation if possible)
- A detailed description explaining what the formula represents and what each variable means
- The context or section where it appears
- Whether it's a definition, theorem, lemma, or general equation
- A Chinese description that explains the formula's purpose

Return a JSON array where each element has these properties:
- "formula": The mathematical expression (use LaTeX notation)
- "description": What the formula represents or calculates
- "variables": Detailed explanation of what each variable/symbol means in the formula
- "variables_chinese": Chinese translation of variable explanations (same structure as variables)
- "type": One of ["definition", "theorem", "lemma", "equation", "inequality", "identity", "other"]
- "context": Brief context about where/how it's used
- "chinese_description": A comprehensive Chinese description of the formula and its purpose

Example format:
[
{{
"formula": "E = mc^2",
"description": "Einstein's mass-energy equivalence relation",
"variables": {{"E": "energy (joules)", "m": "mass (kilograms)", "c": "speed of light in vacuum (≈3×10^8 m/s)"}},
"variables_chinese": {{"E": "能量 (焦耳)", "m": "质量 (千克)", "c": "真空中的光速 (≈3×10^8 m/s)"}},
"type": "equation",
"context": "Fundamental equation in special relativity theory",
"chinese_description": "爱因斯坦质能等价公式,表示质量和能量之间的等价关系"
}},
{{
"formula": "F = ma",
"description": "Newton's second law of motion",
"variables": {{"F": "net force (newtons)", "m": "mass (kilograms)", "a": "acceleration (m/s²)"}},
"variables_chinese": {{"F": "净力 (牛顿)", "m": "质量 (千克)", "a": "加速度 (m/s²)"}},
"type": "equation",
"context": "Classical mechanics fundamental law",
"chinese_description": "牛顿第二定律,描述物体受力与加速度的关系"
}}
]

Paper text:
{text}

IMPORTANT INSTRUCTIONS:
1. Extract EVERY formula, even simple ones like "x + y = z" or "f(x) = ax + b"
2. For each variable or symbol in the formula, explain what it represents
3. Include units of measurement when relevant
4. Provide comprehensive Chinese descriptions that explain the formula's significance
5. Be thorough and detailed in variable explanations"""
            }
        ]

        related_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=headers,
            json={
                'model': 'deepseek-chat',
                'messages': related_messages,
                'temperature': 0.1,  # Lower temperature for more consistent extraction
                'max_tokens': 4000  # Increased token limit for more formulas
            },
            timeout=120  # Increased timeout for large documents
        )

        if related_response.ok:
            response_content = related_response.json()['choices'][0]['message']['content']
            try:
                # Try to extract JSON from the response
                import re
                # Look for JSON array in the response
                json_match = re.search(r'\[[\s\S]*\]', response_content)
                if json_match:
                    formulas = json.loads(json_match.group())
                    return formulas
                else:
                    # Try direct JSON parsing
                    return json.loads(response_content)
            except json.JSONDecodeError as e:
                print(f"Invalid JSON from formulas API: {response_content}")
                print(f"JSON Error: {str(e)}")
                return []
        else:
            print(f"Formulas API error: {related_response.text}")
            return []

    except Exception as e:
        print(f"Error in formula extraction: {str(e)}")
        return []