igny8/backend/scripts/import_all_seed_keywords.py

#!/usr/bin/env python3
"""
Import All Seed Keywords from KW_DB Folder Structure

This script automatically scans the KW_DB folder structure and imports all CSV files.
It extracts Industry and Sector from folder names and imports keywords with duplicate checking.

FOLDER STRUCTURE EXPECTED:
/data/app/igny8/KW_DB/
    {Industry}/
        {Sector}/
            *.csv files

DUPLICATE HANDLING:
- Checks: keyword + country (case-insensitive)
- If duplicate exists in same industry+sector: SKIPS import

Usage:
    # Dry run (preview all imports)
    docker compose -f docker-compose.app.yml exec igny8_backend \\
        python3 /app/scripts/import_all_seed_keywords.py \\
        --base-path /data/app/igny8/KW_DB \\
        --dry-run --verbose

    # Actual import
    docker compose -f docker-compose.app.yml exec igny8_backend \\
        python3 /app/scripts/import_all_seed_keywords.py \\
        --base-path /data/app/igny8/KW_DB

Author: IGNY8 Team
Date: January 13, 2026
"""

import os
import sys
import csv
import argparse
import django
from pathlib import Path

# Make the backend importable and run from the app directory.
# '/app' is the same root used by the usage examples in the module docstring
# (/app/scripts/import_all_seed_keywords.py); it is put first on sys.path so
# it takes precedence over any other copy of the package on the import path.
sys.path.insert(0, '/app')
os.chdir('/app')

# Configure Django before any project model is imported.
# setdefault() only fills in the settings module when DJANGO_SETTINGS_MODULE is
# not already set in the environment, so an explicit override still wins.
# django.setup() has to run before the igny8_core model imports that follow.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'igny8_core.settings')
django.setup()

from django.utils.text import slugify
from django.db import transaction
from igny8_core.auth.models import Industry, IndustrySector, SeedKeyword


class BulkKeywordImporter:
    """Import seed keywords from an ``{Industry}/{Sector}/*.csv`` folder tree.

    Folder names become Industry / IndustrySector records (underscores are
    shown as spaces, the slug is derived with ``slugify``) and every CSV row
    becomes a SeedKeyword unless the same keyword + country already exists
    for that industry and sector.

    Attributes:
        base_path: Root folder that contains the industry folders.
        dry_run: When true, nothing is read from or written to the database;
            the importer only reports what it would do.
        verbose: When true, per-keyword progress lines are printed.
        global_stats: Counters accumulated over the whole run.
        file_stats: One stats dict per processed CSV file, in processing order.
    """

    def __init__(self, base_path, dry_run=False, verbose=False):
        self.base_path = Path(base_path)
        self.dry_run = dry_run
        self.verbose = verbose
        self.global_stats = {
            'total_files': 0,
            'total_processed': 0,
            'total_imported': 0,
            'total_skipped_duplicate': 0,
            'total_skipped_invalid': 0,
            'total_errors': 0,
            'industries_created': 0,
            'sectors_created': 0
        }
        self.file_stats = []

    def log(self, message, force=False):
        """Print ``message`` when running verbosely, or always when ``force`` is set."""
        if self.verbose or force:
            print(message)

    def get_or_create_industry(self, name):
        """Return ``(industry, created)`` for the Industry whose slug matches ``name``.

        In dry-run mode the database is not touched: a stand-in object with
        ``id``/``name``/``slug`` attributes is returned and ``created`` is False.
        """
        slug = slugify(name)

        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Industry: {name} (slug: {slug})")

            class MockIndustry:
                def __init__(self):
                    self.id = 0
                    self.name = name
                    self.slug = slug
            return MockIndustry(), False

        industry, created = Industry.objects.get_or_create(
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB'
            }
        )

        if created:
            self.log(f"✓ Created Industry: {name}", force=True)
            self.global_stats['industries_created'] += 1
        else:
            self.log(f"✓ Found existing Industry: {name}")

        return industry, created

    def get_or_create_sector(self, industry, name):
        """Return ``(sector, created)`` for the IndustrySector of ``industry`` matching ``name``.

        In dry-run mode the database is not touched: a stand-in object with
        ``id``/``name``/``slug`` attributes is returned and ``created`` is False.
        """
        slug = slugify(name)

        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Sector: {name} (slug: {slug})")

            class MockSector:
                def __init__(self):
                    self.id = 0
                    self.name = name
                    self.slug = slug
            return MockSector(), False

        sector, created = IndustrySector.objects.get_or_create(
            industry=industry,
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB'
            }
        )

        if created:
            self.log(f"  ✓ Created Sector: {name}", force=True)
            self.global_stats['sectors_created'] += 1
        else:
            self.log(f"  ✓ Found existing Sector: {name}")

        return sector, created

    def is_duplicate(self, keyword, country, industry, sector):
        """Return True when ``keyword`` + ``country`` already exists in this industry+sector.

        The keyword comparison is case-insensitive. In dry-run mode no query
        is made and the answer is always False.
        """
        if self.dry_run:
            return False  # Skip duplicate check in dry run

        return SeedKeyword.objects.filter(
            keyword__iexact=keyword,
            country=country,
            industry=industry,
            sector=sector
        ).exists()

    @staticmethod
    def _cell(row, column):
        """Return the stripped text of ``row[column]``, or ``''`` when it is absent.

        ``csv.DictReader`` fills the missing trailing columns of a short row
        with ``None`` (its default ``restval``), so a plain
        ``row.get(column, default).strip()`` would fail on such rows.
        """
        value = row.get(column)
        return value.strip() if value else ''

    @staticmethod
    def _to_int(text):
        """Return ``int(text)``, or 0 when ``text`` is empty or not an integer."""
        try:
            return int(text) if text else 0
        except (ValueError, TypeError):
            return 0

    def parse_csv_row(self, row):
        """Extract the keyword fields from one ``csv.DictReader`` row.

        Args:
            row: Mapping with the columns ``Keyword``, ``Country``, ``Volume``
                and ``Difficulty``; every column except ``Keyword`` may be
                missing, empty or ``None``.

        Returns:
            A dict with ``keyword``, ``country`` (upper-cased, default ``US``),
            ``volume`` (int, default 0) and ``difficulty`` (int clamped to
            0-100, default 0), or ``None`` when the row has no keyword or
            cannot be parsed.
        """
        try:
            keyword = self._cell(row, 'Keyword')
            if not keyword:
                return None

            return {
                'keyword': keyword,
                'country': self._cell(row, 'Country').upper() or 'US',
                'volume': self._to_int(self._cell(row, 'Volume')),
                # Clamp to 0-100
                'difficulty': max(0, min(100, self._to_int(self._cell(row, 'Difficulty'))))
            }

        except Exception as e:
            # Best effort: a malformed row is reported and skipped, never fatal.
            self.log(f"  ⚠ Error parsing row: {e}")
            return None

    def _import_rows(self, reader, industry, sector, file_stats):
        """Import every row of ``reader``, updating the counters in ``file_stats``."""
        for row in reader:
            file_stats['processed'] += 1

            keyword_data = self.parse_csv_row(row)
            if not keyword_data:
                file_stats['skipped_invalid'] += 1
                continue

            keyword = keyword_data['keyword']
            country = keyword_data['country']
            volume = keyword_data['volume']
            difficulty = keyword_data['difficulty']

            # Check for duplicate (keyword + country)
            if self.is_duplicate(keyword, country, industry, sector):
                self.log(f"    ⊘ SKIP (duplicate): {keyword} [{country}]")
                file_stats['skipped_duplicate'] += 1
                continue

            if self.dry_run:
                self.log(f"    [DRY RUN] Would import: {keyword} [{country}] (vol:{volume}, diff:{difficulty})")
            else:
                SeedKeyword.objects.create(
                    keyword=keyword,
                    industry=industry,
                    sector=sector,
                    volume=volume,
                    difficulty=difficulty,
                    country=country,
                    is_active=True
                )
                self.log(f"    ✓ Imported: {keyword} [{country}]")

            file_stats['imported'] += 1

    def import_csv_file(self, csv_path, industry, sector):
        """Import the keywords of a single CSV file.

        The file is imported all-or-nothing: outside dry-run mode its rows are
        written inside their own ``transaction.atomic()`` block, so a failure
        rolls back only this file and leaves any enclosing transaction usable.

        Args:
            csv_path: ``Path`` of the CSV file (with a header row).
            industry: Industry the keywords belong to.
            sector: IndustrySector the keywords belong to.

        Returns:
            A stats dict with ``file``, ``processed``, ``imported``,
            ``skipped_duplicate``, ``skipped_invalid`` and ``errors``. The
            same numbers are added to ``self.global_stats``.
        """
        file_stats = {
            'file': csv_path.name,
            'processed': 0,
            'imported': 0,
            'skipped_duplicate': 0,
            'skipped_invalid': 0,
            'errors': 0
        }

        try:
            # utf-8-sig drops a leading BOM (otherwise the first header would
            # read '\ufeffKeyword' and every row would look invalid);
            # newline='' is what the csv module requires for correct parsing
            # of quoted fields that contain line breaks.
            with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
                reader = csv.DictReader(f)
                if self.dry_run:
                    self._import_rows(reader, industry, sector, file_stats)
                else:
                    # The exception is handled outside this atomic block so the
                    # savepoint is rolled back cleanly before we carry on.
                    with transaction.atomic():
                        self._import_rows(reader, industry, sector, file_stats)

        except Exception as e:
            self.log(f"    ❌ Error processing file: {e}", force=True)
            file_stats['errors'] += 1
            if not self.dry_run and file_stats['imported']:
                # Those rows were rolled back together with the file.
                self.log(f"    ↩ Rolled back {file_stats['imported']} row(s) from this file", force=True)
                file_stats['imported'] = 0

        self.global_stats['total_processed'] += file_stats['processed']
        self.global_stats['total_imported'] += file_stats['imported']
        self.global_stats['total_skipped_duplicate'] += file_stats['skipped_duplicate']
        self.global_stats['total_skipped_invalid'] += file_stats['skipped_invalid']
        self.global_stats['total_errors'] += file_stats['errors']

        return file_stats

    def scan_and_import(self):
        """Walk ``base_path`` as Industry / Sector / *.csv and import every file.

        Returns:
            False when the base path is missing or not a directory, or when
            any file failed to import; True otherwise.
        """

        if not self.base_path.is_dir():
            if self.base_path.exists():
                print(f"❌ ERROR: Base path is not a directory: {self.base_path}")
            else:
                print(f"❌ ERROR: Base path not found: {self.base_path}")
            return False

        print(f"\n{'='*70}")
        print("BULK IMPORT: ALL SEED KEYWORDS FROM KW_DB")
        print(f"{'='*70}")
        print(f"Base Path: {self.base_path}")
        if self.dry_run:
            print("Mode:      DRY RUN (no database changes)")
        print(f"{'='*70}\n")

        # Scan directory structure: Industry / Sector / *.csv
        for industry_dir in sorted(self.base_path.iterdir()):
            if not industry_dir.is_dir():
                continue

            industry_name = industry_dir.name.replace('_', ' ')

            print(f"\n📁 Industry: {industry_name}")
            print(f"{'─'*70}")

            industry, _ = self.get_or_create_industry(industry_name)

            for sector_dir in sorted(industry_dir.iterdir()):
                if not sector_dir.is_dir():
                    continue

                sector_name = sector_dir.name.replace('_', ' ')

                print(f"\n  📂 Sector: {sector_name}")

                sector, _ = self.get_or_create_sector(industry, sector_name)

                # Find all CSV files
                csv_files = sorted(sector_dir.glob('*.csv'))

                if not csv_files:
                    print("    ⚠ No CSV files found")
                    continue

                print(f"    Found {len(csv_files)} CSV files")

                # Import each CSV file; one transaction per sector
                with transaction.atomic():
                    for csv_file in csv_files:
                        self.global_stats['total_files'] += 1
                        print(f"\n    📄 {csv_file.name}")

                        file_stats = self.import_csv_file(csv_file, industry, sector)
                        self.file_stats.append(file_stats)

                        print(f"       ✓ {file_stats['imported']} imported, "
                              f"⊘ {file_stats['skipped_duplicate']} duplicates, "
                              f"⊘ {file_stats['skipped_invalid']} invalid")

                    # Rollback in dry run mode
                    if self.dry_run:
                        transaction.set_rollback(True)

        # Print summary
        print(f"\n\n{'='*70}")
        print("GLOBAL IMPORT SUMMARY")
        print(f"{'='*70}")
        print(f"Total CSV files:           {self.global_stats['total_files']}")
        print(f"Industries created:        {self.global_stats['industries_created']}")
        print(f"Sectors created:           {self.global_stats['sectors_created']}")
        print("─────────────────────────────────────────────────────────────────────")
        print(f"Total rows processed:      {self.global_stats['total_processed']}")
        print(f"✓ Total imported:          {self.global_stats['total_imported']}")
        print(f"⊘ Skipped (duplicate):     {self.global_stats['total_skipped_duplicate']}")
        print(f"⊘ Skipped (invalid):       {self.global_stats['total_skipped_invalid']}")
        print(f"✗ Total errors:            {self.global_stats['total_errors']}")
        print(f"{'='*70}\n")

        error_count = self.global_stats['total_errors']

        if self.dry_run:
            print("ℹ This was a DRY RUN - no data was saved to database")
            print("Remove --dry-run flag to perform actual import\n")
        elif error_count:
            # Do not claim success when some files could not be imported.
            print(f"⚠ Import finished with {error_count} file error(s); failed files were not imported")
            print("✓ Check Django admin: /admin/auth/seedkeyword/\n")
        else:
            print("✓ Import completed successfully!")
            print("✓ Check Django admin: /admin/auth/seedkeyword/\n")

        return error_count == 0


def main():
    """Command-line entry point: parse the options, run the import, exit.

    The process exits with status 0 when ``scan_and_import()`` reports
    success and with status 1 otherwise.
    """
    usage_examples = """
Examples:
  # Dry run (preview all imports)
  docker compose -f docker-compose.app.yml exec igny8_backend \\
    python3 /app/scripts/import_all_seed_keywords.py \\
    --base-path /data/app/igny8/KW_DB \\
    --dry-run --verbose

  # Actual import
  docker compose -f docker-compose.app.yml exec igny8_backend \\
    python3 /app/scripts/import_all_seed_keywords.py \\
    --base-path /data/app/igny8/KW_DB

Folder Structure Expected:
  /data/app/igny8/KW_DB/
      HealthCare_Medical/
          Physiotherapy_Rehabilitation/
              google_us_physical-therapy*.csv
              google_us_muscle-stimulator*.csv
              ...
        """

    # RawDescriptionHelpFormatter keeps the epilog's line breaks and indentation.
    cli = argparse.ArgumentParser(
        description='Import all seed keywords from KW_DB folder structure',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument(
        '--base-path',
        default='/data/app/igny8/KW_DB',
        help='Base path to KW_DB folder (default: /data/app/igny8/KW_DB)',
    )
    cli.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview without saving to database',
    )
    cli.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed progress for each keyword',
    )
    options = cli.parse_args()

    # Build the importer from the parsed options and run it
    importer = BulkKeywordImporter(
        options.base_path,
        dry_run=options.dry_run,
        verbose=options.verbose,
    )
    finished_ok = importer.scan_and_import()

    sys.exit(0 if finished_ok else 1)


if __name__ == '__main__':
    main()