Files
igny8/scripts/extract_ai_elements.py
2025-11-09 19:07:06 +05:00

381 lines
13 KiB
Python

#!/usr/bin/env python3
"""
IGNY8 AI Data Mapping Script
Extracts complete reference table for all AI-related elements (functions, models, prompts, limits, calls)
to eliminate assumptions during restructuring.
Output: Markdown table with all AI Elements for cluster, idea, content, image
"""
import os
import re
import json
import ast
from pathlib import Path
from typing import Dict, List, Any, Optional
# Repository root, derived from this script's location (assumes the script
# lives in <root>/scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
# Backend package root that all the scanned AI modules live under.
BACKEND_ROOT = PROJECT_ROOT / "backend" / "igny8_core"
def _decorator_name(decorator) -> str:
    """Return the bare name of a decorator node (handles @name, @pkg.name, @name(...))."""
    target = decorator.func if isinstance(decorator, ast.Call) else decorator
    if isinstance(target, ast.Attribute):
        return target.attr
    if isinstance(target, ast.Name):
        return target.id
    return ""


def extract_function_info(file_path: Path, function_name: str) -> Dict[str, Any]:
    """Extract information about a function from a Python file.

    Parses the file with ``ast`` and inspects the first function (sync or
    async) named *function_name* for AI-related usage patterns: AIProcessor
    usage, Celery task markers, progress/step tracking, prompt and model
    sources, and validation/limit checks.

    Returns a dict of findings, or ``{"error": ...}`` if the file cannot be
    read or parsed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        return {"error": f"Could not read file: {e}"}
    try:
        tree = ast.parse(content)
    except SyntaxError:
        return {"error": "Syntax error in file"}

    info = {
        "file": str(file_path.relative_to(PROJECT_ROOT)),
        "function_name": function_name,
        "found": False,
        "line_number": None,
        "uses_ai_processor": False,
        "uses_celery": False,
        "has_progress_callback": False,
        "has_request_steps": False,
        "has_response_steps": False,
        "prompt_source": "Unknown",
        "model_source": "Unknown",
        "validation_checks": [],
        "limit_checks": [],
    }

    for node in ast.walk(tree):
        # Match async defs too, so `async def` tasks are not silently skipped.
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == function_name:
            info["found"] = True
            info["line_number"] = node.lineno
            # Source text of just this function, for cheap substring checks.
            func_content = ast.get_source_segment(content, node) or ""
            if "AIProcessor" in func_content or "ai_processor" in func_content:
                info["uses_ai_processor"] = True
            # Celery: either the body touches self.request, or the function is
            # decorated with @shared_task. (Previously approximated with the
            # slice `content[:node.lineno * 100]`, which confused line numbers
            # with character offsets and could match decorators of unrelated
            # functions.)
            is_shared_task = any(
                _decorator_name(dec) == "shared_task" for dec in node.decorator_list
            )
            if "self.request" in func_content or is_shared_task:
                info["uses_celery"] = True
            if "progress_callback" in func_content or "progress_tracker" in func_content:
                info["has_progress_callback"] = True
            if "request_steps" in func_content:
                info["has_request_steps"] = True
            if "response_steps" in func_content:
                info["has_response_steps"] = True
            # Prompt source, most specific marker first.
            if "get_prompt_value" in func_content:
                info["prompt_source"] = "Database (get_prompt_value)"
            elif "get_default_prompt" in func_content:
                info["prompt_source"] = "Default (get_default_prompt)"
            elif "prompt_template" in func_content.lower():
                info["prompt_source"] = "Inline/Hardcoded"
            # Model selection. ("self.default_model" contains "default_model",
            # so a single substring test covers both spellings.)
            if "default_model" in func_content:
                info["model_source"] = "AIProcessor.default_model"
            elif "get_model" in func_content:
                info["model_source"] = "Function.get_model()"
            elif "IntegrationSettings" in func_content:
                info["model_source"] = "IntegrationSettings.config['model']"
            # Validation and plan/credit limit checks.
            if "validate" in func_content.lower():
                info["validation_checks"].append("Has validate() call")
            if "check_credits" in func_content:
                info["limit_checks"].append("Credit check")
            if "daily_cluster_limit" in func_content or "max_clusters" in func_content:
                info["limit_checks"].append("Plan limits")
            break
    return info
def extract_class_info(file_path: Path, class_name: str) -> Dict[str, Any]:
    """Extract information about a class from a Python file.

    Parses the file with ``ast`` and reports the class's location, base
    classes, and directly-defined method names (sync and async). Dotted
    bases such as ``module.Base`` are recorded by their attribute name
    (previously only plain-name bases were captured).

    Returns a dict of findings, or ``{"error": ...}`` if the file cannot be
    read or parsed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        return {"error": f"Could not read file: {e}"}
    try:
        tree = ast.parse(content)
    except SyntaxError:
        return {"error": "Syntax error in file"}

    info = {
        "file": str(file_path.relative_to(PROJECT_ROOT)),
        "class_name": class_name,
        "found": False,
        "line_number": None,
        "methods": [],
        "inherits_from": [],
    }

    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef) and node.name == class_name:
            info["found"] = True
            info["line_number"] = node.lineno
            # Record base classes, including dotted ones like pkg.Base.
            for base in node.bases:
                if isinstance(base, ast.Name):
                    info["inherits_from"].append(base.id)
                elif isinstance(base, ast.Attribute):
                    info["inherits_from"].append(base.attr)
            # Only direct members of the class body count as methods
            # (a nested walk would also pick up inner functions).
            for item in node.body:
                if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                    info["methods"].append(item.name)
            break
    return info
def find_ai_functions() -> List[Dict[str, Any]]:
    """Locate every known AI entry point and collect its metadata.

    Walks a hardcoded registry of AI-related elements (core functions,
    Celery tasks, AIProcessor methods, and function classes), skipping
    entries whose source file is missing on disk, and tags each result
    with its element type and feature category.
    """
    # Registry of AI entry points: (name, source file, element type, category).
    registry = [
        ("_auto_cluster_keywords_core",
         BACKEND_ROOT / "modules" / "planner" / "tasks.py", "core_function", "cluster"),
        ("_generate_single_idea_core",
         BACKEND_ROOT / "modules" / "planner" / "tasks.py", "core_function", "ideas"),
        ("auto_generate_content_task",
         BACKEND_ROOT / "modules" / "writer" / "tasks.py", "celery_task", "content"),
        ("AutoClusterFunction",
         BACKEND_ROOT / "ai" / "functions" / "auto_cluster.py", "class", "cluster"),
        ("cluster_keywords",
         BACKEND_ROOT / "utils" / "ai_processor.py", "method", "cluster"),
        ("generate_ideas",
         BACKEND_ROOT / "utils" / "ai_processor.py", "method", "ideas"),
        ("generate_content",
         BACKEND_ROOT / "utils" / "ai_processor.py", "method", "content"),
        ("generate_image",
         BACKEND_ROOT / "utils" / "ai_processor.py", "method", "image"),
        ("run_ai_task",
         BACKEND_ROOT / "ai" / "tasks.py", "celery_task", "unified"),
        ("execute",
         BACKEND_ROOT / "ai" / "engine.py", "method", "unified"),
    ]

    results: List[Dict[str, Any]] = []
    for name, path, element_type, category in registry:
        if not path.exists():
            # File moved or removed — skip rather than fail the whole scan.
            continue
        extractor = extract_class_info if element_type == "class" else extract_function_info
        record = extractor(path, name)
        record["type"] = element_type
        record["category"] = category
        results.append(record)
    return results
def extract_prompt_info() -> List[Dict[str, Any]]:
    """Collect the prompt types hardcoded in get_default_prompt().

    Scans modules/system/utils.py for dictionary entries of the form
    ``'<type>': \"\"\"...\"\"\"`` and reports one record per prompt type.
    Returns an empty list if the utils file is missing.
    """
    utils_file = BACKEND_ROOT / "modules" / "system" / "utils.py"
    if not utils_file.exists():
        return []
    with open(utils_file, 'r', encoding='utf-8') as f:
        source = f.read()
    # Each default prompt is keyed by a quoted name followed by a
    # triple-quoted template — capture just the key.
    return [
        {
            "prompt_type": prompt_type,
            "source": "Hardcoded in get_default_prompt()",
            "file": "modules/system/utils.py",
            "retrieval": "get_prompt_value() -> AIPrompt model or default",
        }
        for prompt_type in re.findall(r"'(\w+)':\s*\"\"\"", source)
    ]
def extract_model_info() -> List[Dict[str, Any]]:
    """Collect the model names declared in the MODEL_RATES constant.

    Scans utils/ai_processor.py for the MODEL_RATES dict literal and reports
    one record per single-quoted string found inside it. Returns an empty
    list if the processor file or the constant is missing.
    """
    processor_file = BACKEND_ROOT / "utils" / "ai_processor.py"
    if not processor_file.exists():
        return []
    with open(processor_file, 'r', encoding='utf-8') as f:
        source = f.read()
    # NOTE(review): this pattern stops at the first '}', so a MODEL_RATES
    # with nested dict values would be truncated, and the inner findall
    # captures every quoted string (values included, not just keys) —
    # verify against the actual shape of MODEL_RATES in ai_processor.py.
    match = re.search(r'MODEL_RATES\s*=\s*\{([^}]+)\}', source, re.DOTALL)
    if not match:
        return []
    return [
        {
            "model_name": model_name,
            "source": "MODEL_RATES constant",
            "file": "utils/ai_processor.py",
            "selection": "AIProcessor._get_model() -> IntegrationSettings or Django settings",
        }
        for model_name in re.findall(r"'([^']+)'", match.group(1))
    ]
def generate_markdown_table(functions: List[Dict], prompts: List[Dict], models: List[Dict]) -> str:
    """Generate the AI elements reference document as a Markdown string.

    Builds four tables — core functions, prompt sources, model configuration,
    and validation & limits — from the dicts produced by find_ai_functions(),
    extract_prompt_info() and extract_model_info().
    """
    def _flag(value) -> str:
        # Render a truthy flag as a visible table marker. (The previous code
        # evaluated `"" if value else ""` — both branches empty — so these
        # columns were always blank; the markers were evidently lost.)
        return "✅" if value else "❌"

    output = []
    output.append("# IGNY8 AI Elements Reference Table\n")
    output.append("Generated by extract_ai_elements.py\n")
    output.append("---\n\n")
    # Functions table
    output.append("## 🧠 AI Core Functions\n\n")
    output.append("| Function Name | Category | Type | File | Line | Uses AIProcessor | Celery | Progress | Steps | Prompt Source | Model Source |\n")
    output.append("|---------------|----------|------|------|------|------------------|--------|----------|-------|---------------|--------------|\n")
    for func in functions:
        if func.get("error"):
            continue  # unreadable/unparsable file — nothing to report
        name = func.get("function_name") or func.get("class_name", "N/A")
        category = func.get("category", "N/A")
        func_type = func.get("type", "N/A")
        file = func.get("file", "N/A")
        line = str(func.get("line_number", "N/A"))
        uses_ai = _flag(func.get("uses_ai_processor"))
        celery = _flag(func.get("uses_celery"))
        progress = _flag(func.get("has_progress_callback"))
        steps = _flag(func.get("has_request_steps") or func.get("has_response_steps"))
        prompt = func.get("prompt_source", "Unknown")
        model = func.get("model_source", "Unknown")
        output.append(f"| {name} | {category} | {func_type} | `{file}` | {line} | {uses_ai} | {celery} | {progress} | {steps} | {prompt} | {model} |\n")
    # Prompts table
    output.append("\n## 🧱 Prompt Sources\n\n")
    output.append("| Prompt Type | Source | File | Retrieval Method |\n")
    output.append("|-------------|--------|------|------------------|\n")
    for prompt in prompts:
        output.append(f"| {prompt['prompt_type']} | {prompt['source']} | `{prompt['file']}` | {prompt['retrieval']} |\n")
    # Models table
    output.append("\n## 🧾 Model Configuration\n\n")
    output.append("| Model Name | Source | File | Selection Method |\n")
    output.append("|------------|--------|------|------------------|\n")
    for model in models:
        output.append(f"| {model['model_name']} | {model['source']} | `{model['file']}` | {model['selection']} |\n")
    # Validation and limits table (only for functions actually located).
    output.append("\n## ⚠️ Validation & Limits\n\n")
    output.append("| Function | Validation Checks | Limit Checks |\n")
    output.append("|----------|-------------------|--------------|\n")
    for func in functions:
        if func.get("error") or not func.get("found"):
            continue
        name = func.get("function_name") or func.get("class_name", "N/A")
        validations = ", ".join(func.get("validation_checks", [])) or "None"
        limits = ", ".join(func.get("limit_checks", [])) or "None"
        output.append(f"| {name} | {validations} | {limits} |\n")
    return "".join(output)
def main():
    """Entry point: extract AI metadata and emit the Markdown reference table."""
    print("🔍 Extracting AI elements from codebase...")
    functions = find_ai_functions()
    prompts = extract_prompt_info()
    models = extract_model_info()
    print(f"✅ Found {len(functions)} functions")
    print(f"✅ Found {len(prompts)} prompt types")
    print(f"✅ Found {len(models)} models")
    markdown = generate_markdown_table(functions, prompts, models)
    # Persist the report under docs/ActiveDocs, creating the folder if needed.
    output_file = PROJECT_ROOT / "docs" / "ActiveDocs" / "AI-ELEMENTS-EXTRACTED.md"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown)
    print(f"\n✅ Table saved to: {output_file.relative_to(PROJECT_ROOT)}")
    # Echo the full report to the console as well.
    banner = "=" * 80
    print("\n" + banner)
    print(markdown)
    print(banner)


if __name__ == "__main__":
    main()