189 lines
6.1 KiB
Python
189 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Import US/CA Technology & IT Services Keywords
|
||
|
||
Targeted import script for the "US CA" folder in Technology_&_IT_Services.
|
||
Maps folder names to existing DB sectors and imports all CSV files.
|
||
|
||
Usage:
|
||
docker exec igny8_backend python3 /app/scripts/import_us_ca_tech.py --dry-run
|
||
docker exec igny8_backend python3 /app/scripts/import_us_ca_tech.py
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import csv
|
||
import argparse
|
||
|
||
# Standalone-script bootstrap: make the app package importable and point
# Django at the project settings BEFORE any model imports below.
sys.path.insert(0, '/app')
os.chdir('/app')
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'igny8_core.settings')

# django.setup() must run before importing models (see imports further down).
import django
django.setup()
|
||
|
||
from pathlib import Path
|
||
from django.db import transaction
|
||
from igny8_core.auth.models import Industry, IndustrySector, SeedKeyword
|
||
|
||
|
||
# Hard-coded mapping: folder name in "US CA" -> sector slug in DB
# (folders without an entry here are never scanned; slugs must already
# exist as IndustrySector rows under INDUSTRY_ID).
FOLDER_TO_SECTOR_SLUG = {
    'Automation & Workflow Systems': 'automation-workflow-systems',
    'Cloud Services': 'cloud-services',
    'DATA & AI Services': 'data-ai-services',
    'Digital Marketing & SEO': 'digital-marketing-seo',
    'SAAS': 'saas',
    'Web Development & Design': 'web-development-design',
}

# Target Industry primary key — assumed to exist; main() will raise
# Industry.DoesNotExist otherwise.
INDUSTRY_ID = 27  # Technology & IT Services

# Root of the mounted keyword CSV tree; each mapped folder lives under it.
BASE_PATH = Path('/data/app/igny8/KW_DB/Technology_&_IT_Services/US CA')
|
||
|
||
|
||
def _to_int(raw):
    """Best-effort parse of a numeric CSV cell; returns 0 on any failure.

    Handles None (csv.DictReader uses None for missing trailing cells),
    surrounding whitespace, and thousands separators ('1,200' -> 1200).
    """
    if raw is None:
        return 0
    try:
        cleaned = str(raw).strip().replace(',', '')
        return int(cleaned) if cleaned else 0
    except (ValueError, TypeError):
        return 0


def parse_csv_row(row):
    """Parse a CSV row into keyword data.

    Args:
        row: dict from csv.DictReader. Expected columns: 'Keyword'
            (required), 'Country', 'Volume', 'Difficulty'. Short rows
            yield None values for trailing cells, so every field is
            normalized defensively.

    Returns:
        dict with keys 'keyword', 'country', 'volume', 'difficulty',
        or None when the keyword cell is missing/empty (row is invalid).
    """
    # (x or '') guards against None values — row.get's default only covers
    # a missing key, not a key present with value None.
    keyword = (row.get('Keyword') or '').strip()
    if not keyword:
        return None

    # Empty/missing country falls back to 'US'.
    country = (row.get('Country') or '').strip().upper() or 'US'

    volume = _to_int(row.get('Volume'))

    # Difficulty is a 0-100 score; clamp out-of-range values.
    difficulty = max(0, min(100, _to_int(row.get('Difficulty'))))

    return {
        'keyword': keyword,
        'country': country,
        'volume': volume,
        'difficulty': difficulty,
    }
|
||
|
||
|
||
def import_csv(csv_path, industry, sector, dry_run=False):
    """Import every keyword row from one CSV file; returns a stats dict.

    Rows are parsed via parse_csv_row. A row whose keyword already exists
    (case-insensitive) for the same industry+sector is skipped, mirroring
    the DB unique constraint (keyword, industry_id, sector_id) — country
    is deliberately not part of the duplicate check.

    Args:
        csv_path: path of the CSV file to read (str or Path).
        industry: Industry model instance the keywords belong to.
        sector: IndustrySector model instance the keywords belong to.
        dry_run: when True, count what would be imported without writing.

    Returns:
        dict of counters: processed, imported, skipped_dup, skipped_inv,
        errors.
    """
    counters = dict.fromkeys(
        ('processed', 'imported', 'skipped_dup', 'skipped_inv', 'errors'), 0)

    try:
        with open(csv_path, 'r', encoding='utf-8') as handle:
            for row in csv.DictReader(handle):
                counters['processed'] += 1

                data = parse_csv_row(row)
                if data is None:
                    counters['skipped_inv'] += 1
                    continue

                duplicate = SeedKeyword.objects.filter(
                    keyword__iexact=data['keyword'],
                    industry=industry,
                    sector=sector,
                ).exists()
                if duplicate:
                    counters['skipped_dup'] += 1
                    continue

                # In dry-run mode the row is only counted, never written.
                if not dry_run:
                    SeedKeyword.objects.create(
                        keyword=data['keyword'],
                        industry=industry,
                        sector=sector,
                        volume=data['volume'],
                        difficulty=data['difficulty'],
                        country=data['country'],
                        is_active=True,
                    )
                counters['imported'] += 1

    except Exception as e:
        # Any failure (unreadable file, DB error) aborts this file only;
        # the caller keeps going with the next CSV.
        print(f" ❌ Error: {e}")
        counters['errors'] += 1

    return counters
|
||
|
||
|
||
def main():
    """CLI entry point: import all mapped US/CA sector folders.

    Each sector folder runs inside its own transaction; with --dry-run the
    transaction is rolled back so the database is left untouched while the
    per-file counts are still reported.
    """
    parser = argparse.ArgumentParser(description='Import US/CA Tech keywords')
    parser.add_argument('--dry-run', action='store_true', help='Preview without saving')
    args = parser.parse_args()

    # Raises Industry.DoesNotExist if the hard-coded id is absent.
    industry = Industry.objects.get(id=INDUSTRY_ID)

    print(f"\n{'='*70}")
    print(f"IMPORT: Technology & IT Services — US CA Keywords")
    print(f"{'='*70}")
    print(f"Industry: {industry.name} (id={industry.id})")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE IMPORT'}")
    print(f"{'='*70}\n")

    stat_keys = ('processed', 'imported', 'skipped_dup', 'skipped_inv', 'errors')
    totals = {key: 0 for key in ('files',) + stat_keys}

    for folder_name, sector_slug in sorted(FOLDER_TO_SECTOR_SLUG.items()):
        folder_path = BASE_PATH / folder_name
        if not folder_path.exists():
            print(f"⚠ Folder not found: {folder_name}")
            continue

        sector = IndustrySector.objects.get(industry=industry, slug=sector_slug)
        print(f"\n📂 Sector: {sector.name} (id={sector.id}, slug={sector.slug})")

        csv_files = sorted(folder_path.glob('*.csv'))
        if not csv_files:
            print(f" ⚠ No CSV files")
            continue

        print(f" Found {len(csv_files)} CSV files")

        # One transaction per sector folder.
        with transaction.atomic():
            for csv_file in csv_files:
                totals['files'] += 1
                stats = import_csv(csv_file, industry, sector, dry_run=args.dry_run)

                for key in stat_keys:
                    totals[key] += stats[key]

                print(f" 📄 {csv_file.name}")
                print(f" rows={stats['processed']} | ✓ imported={stats['imported']} | ⊘ dup={stats['skipped_dup']} | inv={stats['skipped_inv']}")

            # Dry run: undo everything written inside this atomic block.
            if args.dry_run:
                transaction.set_rollback(True)

    print(f"\n\n{'='*70}")
    print(f"SUMMARY")
    print(f"{'='*70}")
    print(f"Total CSV files: {totals['files']}")
    print(f"Total rows processed: {totals['processed']}")
    print(f"✓ Imported: {totals['imported']}")
    print(f"⊘ Skipped (duplicate): {totals['skipped_dup']}")
    print(f"⊘ Skipped (invalid): {totals['skipped_inv']}")
    print(f"✗ Errors: {totals['errors']}")
    print(f"{'='*70}\n")

    if args.dry_run:
        print("ℹ DRY RUN — no data saved. Remove --dry-run to import.\n")
    else:
        print("✓ Import completed!\n")
|