# igny8/backend/scripts/import_seed_keywords_single.py

#!/usr/bin/env python3
"""
Import Seed Keywords from Single CSV File

This script imports keywords from a single CSV file into the IGNY8 global keywords database.
Use this for testing before running full import.

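DUPLICATE HANDLING:
- Checks: keyword (case-insensitive) + country, within the same industry+sector
- If a duplicate exists: SKIPS the row
- Duplicate checks are skipped in --dry-run mode (every valid row is reported as "would import")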

Usage:
    docker compose -f docker-compose.app.yml exec igny8_backend \\
        python3 /app/scripts/import_seed_keywords_single.py \\
        --csv /app/../KW_DB/HealthCare_Medical/Physiotherapy_Rehabilitation/google_us_physical-therapy_matching-terms_2025-12-19_04-25-15.csv \\
        --industry "HealthCare Medical" \\
        --sector "Physiotherapy Rehabilitation" \\
        --dry-run

Author: IGNY8 Team
Date: January 13, 2026
"""

import os
import sys
import csv
import argparse
import django
from pathlib import Path

# Make the project importable and the working directory predictable.
# The '/app' location is hard-coded, so this script expects the project to
# live at /app (the usage example above runs it inside the backend container).
sys.path.insert(0, '/app')
os.chdir('/app')

# Configure Django before any model import: the settings module is only
# defaulted here (an already-set DJANGO_SETTINGS_MODULE wins), and
# django.setup() has to run before the igny8_core model imports that follow.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'igny8_core.settings')
django.setup()

from django.utils.text import slugify
from django.db import transaction
from igny8_core.auth.models import Industry, IndustrySector, SeedKeyword


class KeywordImporter:
    """Import seed keywords from a single CSV file into the database.

    Rows are read with ``csv.DictReader``; the columns ``Keyword``,
    ``Country``, ``Volume`` and ``Difficulty`` are used. Every valid row
    becomes a ``SeedKeyword`` attached to the requested industry and sector
    unless the same keyword/country pair already exists there.

    Attributes:
        dry_run: When true, nothing is written; actions are only logged.
        verbose: When true, per-row progress messages are printed.
        stats: Counters updated while importing (``processed``, ``imported``,
            ``skipped_duplicate``, ``skipped_invalid``, ``errors``).
    """

    class _DryRunRecord:
        """Placeholder returned instead of a model instance in dry-run mode."""

        def __init__(self, name, slug):
            self.id = 0
            self.name = name
            self.slug = slug

    def __init__(self, dry_run=False, verbose=False):
        """Store the run options and reset all counters to zero."""
        self.dry_run = dry_run
        self.verbose = verbose
        self.stats = {
            'processed': 0,
            'imported': 0,
            'skipped_duplicate': 0,
            'skipped_invalid': 0,
            'errors': 0,
        }

    def log(self, message, force=False):
        """Print ``message`` when verbose mode is on or ``force`` is true."""
        if self.verbose or force:
            print(message)

    def get_or_create_industry(self, name):
        """Return ``(industry, created)`` for the Industry matching ``name``.

        The lookup key is the slugified name. In dry-run mode the database
        is not touched: a placeholder object with ``id``/``name``/``slug``
        attributes is returned together with ``created=False``.
        """
        slug = slugify(name)

        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Industry: {name} (slug: {slug})")
            return self._DryRunRecord(name, slug), False

        industry, created = Industry.objects.get_or_create(
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB',
            },
        )

        if created:
            self.log(f"✓ Created Industry: {name}", force=True)
        else:
            self.log(f"✓ Found existing Industry: {name}")

        return industry, created

    def get_or_create_sector(self, industry, name):
        """Return ``(sector, created)`` for the IndustrySector under ``industry``.

        The lookup key is ``industry`` plus the slugified name. In dry-run
        mode the database is not touched and a placeholder object is
        returned together with ``created=False``.
        """
        slug = slugify(name)

        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Sector: {name} (slug: {slug})")
            return self._DryRunRecord(name, slug), False

        sector, created = IndustrySector.objects.get_or_create(
            industry=industry,
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB',
            },
        )

        if created:
            self.log(f"  ✓ Created Sector: {name}", force=True)
        else:
            self.log(f"  ✓ Found existing Sector: {name}")

        return sector, created

    def is_duplicate(self, keyword, country, industry, sector):
        """Return True if this keyword already exists for the industry+sector.

        The keyword is compared case-insensitively; country, industry and
        sector must match exactly. In dry-run mode no query is made and the
        result is always False (the industry/sector are placeholders then).
        """
        if self.dry_run:
            return False  # Skip duplicate check in dry run

        return SeedKeyword.objects.filter(
            keyword__iexact=keyword,
            country=country,
            industry=industry,
            sector=sector,
        ).exists()

    def import_keyword(self, keyword_data, industry, sector):
        """Create one SeedKeyword from a parsed row.

        Args:
            keyword_data: Dict with ``keyword``, ``country``, ``volume`` and
                ``difficulty`` keys, as produced by ``parse_csv_row``.
            industry: Industry the keyword belongs to.
            sector: IndustrySector the keyword belongs to.

        Returns:
            True if the keyword was created (or would be, in dry-run mode);
            False if it was skipped as a duplicate or creation failed. The
            ``skipped_duplicate`` / ``errors`` counters are updated here;
            counting successful imports is left to the caller.
        """
        keyword = keyword_data['keyword']
        country = keyword_data['country']
        volume = keyword_data['volume']
        difficulty = keyword_data['difficulty']

        # Check for duplicate (keyword + country)
        if self.is_duplicate(keyword, country, industry, sector):
            self.log(f"  ⊘ SKIP (duplicate): {keyword} [{country}]")
            self.stats['skipped_duplicate'] += 1
            return False

        if self.dry_run:
            self.log(f"  [DRY RUN] Would import: {keyword} [{country}] (vol:{volume}, diff:{difficulty})")
            return True

        try:
            # The nested atomic() acts as a savepoint. Without it, a database
            # error swallowed here would leave the caller's surrounding
            # transaction broken and every later query in the import would
            # fail, instead of just this one row being skipped.
            with transaction.atomic():
                SeedKeyword.objects.create(
                    keyword=keyword,
                    industry=industry,
                    sector=sector,
                    volume=volume,
                    difficulty=difficulty,
                    country=country,
                    is_active=True,
                )
        except Exception as e:
            # Deliberate best-effort: report the failing row and carry on.
            self.log(f"  ⚠ Error creating: {keyword} [{country}]: {e}", force=True)
            self.stats['errors'] += 1
            return False

        self.log(f"  ✓ Imported: {keyword} [{country}] (vol:{volume}, diff:{difficulty})")
        return True

    @staticmethod
    def _cell(row, column):
        """Return the stripped text of ``row[column]``; '' if missing or None."""
        value = row.get(column)
        return '' if value is None else str(value).strip()

    @staticmethod
    def _parse_int(text, default=0):
        """Return ``int(text)``, or ``default`` for empty or non-integer text."""
        if not text:
            return default
        try:
            return int(text)
        except (ValueError, TypeError):
            return default

    def parse_csv_row(self, row):
        """Extract keyword data from one ``csv.DictReader`` row.

        Missing cells are treated like empty ones: ``csv.DictReader`` fills
        the absent trailing fields of a short row with ``None``, and those
        must fall back to the defaults rather than invalidate the row.

        Returns:
            A dict with ``keyword`` (stripped), ``country`` (upper-cased,
            default ``'US'``), ``volume`` (int, default 0) and ``difficulty``
            (int clamped to 0-100, default 0); or None when the row has no
            keyword or is not a mapping.
        """
        try:
            keyword = self._cell(row, 'Keyword')
            if not keyword:
                return None

            return {
                'keyword': keyword,
                'country': self._cell(row, 'Country').upper() or 'US',
                'volume': self._parse_int(self._cell(row, 'Volume')),
                # Clamp to 0-100
                'difficulty': max(0, min(100, self._parse_int(self._cell(row, 'Difficulty')))),
            }
        except (AttributeError, TypeError) as e:
            # Only reachable when ``row`` is not dict-like.
            self.log(f"  ⚠ Error parsing row: {e}")
            return None

    def _print_summary(self):
        """Print the final counters and a closing status message."""
        print(f"\n{'='*70}")
        print("IMPORT SUMMARY")
        print(f"{'='*70}")
        print(f"Total rows processed:  {self.stats['processed']}")
        print(f"✓ Imported:            {self.stats['imported']}")
        print(f"⊘ Skipped (duplicate): {self.stats['skipped_duplicate']}")
        print(f"⊘ Skipped (invalid):   {self.stats['skipped_invalid']}")
        print(f"✗ Errors:              {self.stats['errors']}")
        print(f"{'='*70}\n")

        if self.dry_run:
            print("ℹ This was a DRY RUN - no data was saved to database")
            print("Remove --dry-run flag to perform actual import\n")
            return

        if self.stats['errors']:
            # Do not claim success when some rows failed to save.
            print(f"⚠ Import completed with {self.stats['errors']} error(s) - see messages above")
        else:
            print("✓ Import completed successfully!")
        print("✓ Check Django admin: /admin/auth/seedkeyword/\n")

    def import_csv(self, csv_path, industry_name, sector_name):
        """Import every keyword row of ``csv_path`` into one industry+sector.

        All rows are written inside a single transaction; in dry-run mode
        that transaction is additionally marked for rollback. A summary of
        the counters is printed at the end.

        Args:
            csv_path: Path to the CSV file (``str`` or ``Path``).
            industry_name: Name of the Industry to get or create.
            sector_name: Name of the IndustrySector to get or create.

        Returns:
            True when the file was processed (individual row errors are
            counted in ``stats['errors']`` but do not change the result);
            False when the file is missing, has no ``Keyword`` column, or an
            unexpected error aborted the import.
        """
        csv_path = Path(csv_path)

        if not csv_path.exists():
            print(f"❌ ERROR: CSV file not found: {csv_path}")
            return False

        print(f"\n{'='*70}")
        print("IMPORTING SEED KEYWORDS FROM CSV")
        print(f"{'='*70}")
        print(f"File:     {csv_path.name}")
        print(f"Industry: {industry_name}")
        print(f"Sector:   {sector_name}")
        if self.dry_run:
            print("Mode:     DRY RUN (no database changes)")
        print(f"{'='*70}\n")

        # Get or create Industry and Sector
        industry, _ = self.get_or_create_industry(industry_name)
        sector, _ = self.get_or_create_sector(industry, sector_name)

        print("Processing keywords...\n")

        try:
            # 'utf-8-sig' drops a leading byte-order mark if present; with
            # plain 'utf-8' the BOM would become part of the first header
            # name and every row would be rejected for lacking a 'Keyword'.
            # newline='' is what the csv module asks for on file objects.
            with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
                reader = csv.DictReader(f)

                header = reader.fieldnames or []
                if 'Keyword' not in header:
                    found = ', '.join(header) or 'none'
                    print(f"❌ ERROR: CSV has no 'Keyword' column (columns found: {found})")
                    return False

                with transaction.atomic():
                    for row in reader:
                        self.stats['processed'] += 1

                        keyword_data = self.parse_csv_row(row)
                        if not keyword_data:
                            self.stats['skipped_invalid'] += 1
                            continue

                        if self.import_keyword(keyword_data, industry, sector):
                            self.stats['imported'] += 1

                    # Rollback in dry run mode
                    if self.dry_run:
                        transaction.set_rollback(True)

        except Exception as e:
            # Top-level boundary of the import: report, count and fail.
            print(f"\n❌ ERROR: {e}")
            import traceback
            traceback.print_exc()
            self.stats['errors'] += 1
            return False

        self._print_summary()
        return True


def main():
    """Command-line entry point.

    Parses ``--csv``, ``--industry`` and ``--sector`` (all required) plus the
    optional ``--dry-run`` and ``--verbose`` switches, runs the import, and
    exits with status 0 when ``import_csv`` reports success, 1 otherwise.
    """
    usage_examples = """
Examples:
  # Dry run (preview only)
  docker compose -f docker-compose.app.yml exec igny8_backend \\
    python3 /app/scripts/import_seed_keywords_single.py \\
    --csv /app/../KW_DB/HealthCare_Medical/Physiotherapy_Rehabilitation/google_us_muscle-stimulator_matching-terms_2025-12-19_04-25-32.csv \\
    --industry "HealthCare Medical" \\
    --sector "Physiotherapy Rehabilitation" \\
    --dry-run --verbose

  # Actual import
  docker compose -f docker-compose.app.yml exec igny8_backend \\
    python3 /app/scripts/import_seed_keywords_single.py \\
    --csv /app/../KW_DB/HealthCare_Medical/Physiotherapy_Rehabilitation/google_us_muscle-stimulator_matching-terms_2025-12-19_04-25-32.csv \\
    --industry "HealthCare Medical" \\
    --sector "Physiotherapy Rehabilitation"
        """

    cli = argparse.ArgumentParser(
        description='Import seed keywords from single CSV file (with duplicate check: keyword+country)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Options that must always be supplied.
    required_options = (
        ('--csv', 'Path to CSV file'),
        ('--industry', 'Industry name'),
        ('--sector', 'Sector name'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)

    # Optional on/off switches.
    switches = (
        ('--dry-run', 'Preview without saving to database'),
        ('--verbose', 'Show detailed progress'),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action='store_true', help=help_text)

    options = cli.parse_args()

    runner = KeywordImporter(dry_run=options.dry_run, verbose=options.verbose)
    ok = runner.import_csv(options.csv, options.industry, options.sector)

    # Exit status: 0 on success, 1 on failure.
    if ok:
        sys.exit(0)
    sys.exit(1)


# Run the command-line importer only when this file is executed as a script.
if __name__ == '__main__':
    main()