#!/usr/bin/env python3
"""
Import All Seed Keywords from KW_DB Folder Structure

This script automatically scans the KW_DB folder structure and imports all CSV
files. It extracts Industry and Sector from folder names and imports keywords
with duplicate checking.

FOLDER STRUCTURE EXPECTED:
    /data/app/igny8/KW_DB/
        {Industry}/
            {Sector}/
                *.csv files

DUPLICATE HANDLING:
    - Checks: keyword + country (case-insensitive)
    - If duplicate exists in same industry+sector: SKIPS import
    - Duplicates are detected both against the database and within the
      current run (so dry-run previews are accurate).

Usage:
    # Dry run (preview all imports)
    docker compose -f docker-compose.app.yml exec igny8_backend \\
        python3 /app/scripts/import_all_seed_keywords.py \\
        --base-path /data/app/igny8/KW_DB \\
        --dry-run --verbose

    # Actual import
    docker compose -f docker-compose.app.yml exec igny8_backend \\
        python3 /app/scripts/import_all_seed_keywords.py \\
        --base-path /data/app/igny8/KW_DB

Author: IGNY8 Team
Date: January 13, 2026
"""

import os
import sys
import csv
import argparse
from pathlib import Path
from types import SimpleNamespace

import django

# Change to app directory for Django imports
sys.path.insert(0, '/app')
os.chdir('/app')

# Setup Django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'igny8_core.settings')
django.setup()

from django.utils.text import slugify
from django.db import transaction

from igny8_core.auth.models import Industry, IndustrySector, SeedKeyword


class BulkKeywordImporter:
    """Import keywords from an entire KW_DB folder structure.

    Walks ``base_path`` two levels deep (Industry / Sector), creates any
    missing Industry / IndustrySector rows, and imports every CSV file found,
    skipping duplicates (keyword + country, case-insensitive, scoped to the
    industry + sector).
    """

    def __init__(self, base_path, dry_run=False, verbose=False):
        self.base_path = Path(base_path)
        self.dry_run = dry_run
        self.verbose = verbose
        self.global_stats = {
            'total_files': 0,
            'total_processed': 0,
            'total_imported': 0,
            'total_skipped_duplicate': 0,
            'total_skipped_invalid': 0,
            'total_errors': 0,
            'industries_created': 0,
            'sectors_created': 0
        }
        self.file_stats = []
        # Keywords accepted during THIS run, so duplicates across files (and
        # in dry-run mode, where nothing reaches the DB) are still caught.
        # Entries: (industry_slug, sector_slug, keyword_lower, country)
        self._seen = set()

    def log(self, message, force=False):
        """Print message if verbose or forced."""
        if self.verbose or force:
            print(message)

    def get_or_create_industry(self, name):
        """Get or create an Industry record.

        Returns:
            (industry, created): in dry-run mode ``industry`` is a lightweight
            stand-in (SimpleNamespace with id/name/slug) and ``created`` is
            always False.
        """
        slug = slugify(name)

        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Industry: {name} (slug: {slug})")
            # Mock carries the slug so duplicate checks still work in dry run.
            return SimpleNamespace(id=0, name=name, slug=slug), False

        industry, created = Industry.objects.get_or_create(
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB'
            }
        )

        if created:
            self.log(f"✓ Created Industry: {name}", force=True)
            self.global_stats['industries_created'] += 1
        else:
            self.log(f"✓ Found existing Industry: {name}")

        return industry, created

    def get_or_create_sector(self, industry, name):
        """Get or create an IndustrySector record (see get_or_create_industry)."""
        slug = slugify(name)

        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Sector: {name} (slug: {slug})")
            return SimpleNamespace(id=0, name=name, slug=slug), False

        sector, created = IndustrySector.objects.get_or_create(
            industry=industry,
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB'
            }
        )

        if created:
            self.log(f"  ✓ Created Sector: {name}", force=True)
            self.global_stats['sectors_created'] += 1
        else:
            self.log(f"  ✓ Found existing Sector: {name}")

        return sector, created

    def is_duplicate(self, keyword, country, industry, sector):
        """
        Check if keyword already exists with same country in this
        industry+sector.

        Duplicate check: keyword + country (case-insensitive).
        Checks both the current run's accepted keywords and the database.
        Filtering by slug (not FK instance) lets the read-only DB check run
        in dry-run mode too, so the preview matches a real import.
        """
        key = (industry.slug, sector.slug, keyword.lower(), country)
        if key in self._seen:
            return True

        return SeedKeyword.objects.filter(
            keyword__iexact=keyword,
            country=country,
            industry__slug=industry.slug,
            sector__slug=sector.slug
        ).exists()

    @staticmethod
    def _parse_int(raw, default=0):
        """Parse an int from a CSV cell; tolerate None/empty/garbage."""
        try:
            text = (raw or '').strip()
            return int(text) if text else default
        except (ValueError, TypeError):
            return default

    def parse_csv_row(self, row):
        """Parse a CSV row and extract keyword data.

        Expected columns: Keyword (required), Country (default US),
        Volume (default 0), Difficulty (default 0, clamped to 0-100).

        Returns:
            dict with keys keyword/country/volume/difficulty, or None when
            the row has no keyword or cannot be parsed.

        Note: csv.DictReader yields None (not '') for cells missing from a
        short row, so every value is guarded with ``or`` before .strip().
        """
        try:
            keyword = (row.get('Keyword') or '').strip()
            if not keyword:
                return None

            # Parse country (default to US)
            country = (row.get('Country') or 'US').strip().upper() or 'US'

            volume = self._parse_int(row.get('Volume'))

            # Clamp difficulty to the 0-100 scale the model expects
            difficulty = max(0, min(100, self._parse_int(row.get('Difficulty'))))

            return {
                'keyword': keyword,
                'country': country,
                'volume': volume,
                'difficulty': difficulty
            }
        except Exception as e:
            self.log(f"    ⚠ Error parsing row: {e}")
            return None

    def import_csv_file(self, csv_path, industry, sector):
        """Import keywords from a single CSV file.

        Returns a per-file stats dict; global stats are updated in place.
        """
        file_stats = {
            'file': csv_path.name,
            'processed': 0,
            'imported': 0,
            'skipped_duplicate': 0,
            'skipped_invalid': 0,
            'errors': 0
        }

        try:
            # utf-8-sig strips the BOM Excel prepends, which would otherwise
            # corrupt the first header name ('\ufeffKeyword').
            with open(csv_path, 'r', encoding='utf-8-sig') as f:
                reader = csv.DictReader(f)

                for row in reader:
                    file_stats['processed'] += 1
                    self.global_stats['total_processed'] += 1

                    keyword_data = self.parse_csv_row(row)
                    if not keyword_data:
                        file_stats['skipped_invalid'] += 1
                        self.global_stats['total_skipped_invalid'] += 1
                        continue

                    keyword = keyword_data['keyword']
                    country = keyword_data['country']
                    volume = keyword_data['volume']
                    difficulty = keyword_data['difficulty']

                    # Check for duplicate (keyword + country)
                    if self.is_duplicate(keyword, country, industry, sector):
                        self.log(f"    ⊘ SKIP (duplicate): {keyword} [{country}]")
                        file_stats['skipped_duplicate'] += 1
                        self.global_stats['total_skipped_duplicate'] += 1
                        continue

                    if self.dry_run:
                        self.log(f"    [DRY RUN] Would import: {keyword} [{country}] (vol:{volume}, diff:{difficulty})")
                    else:
                        SeedKeyword.objects.create(
                            keyword=keyword,
                            industry=industry,
                            sector=sector,
                            volume=volume,
                            difficulty=difficulty,
                            country=country,
                            is_active=True
                        )
                        self.log(f"    ✓ Imported: {keyword} [{country}]")

                    # Remember the accepted keyword so later files in this
                    # run (and dry-run previews) see it as a duplicate.
                    self._seen.add(
                        (industry.slug, sector.slug, keyword.lower(), country)
                    )
                    file_stats['imported'] += 1
                    self.global_stats['total_imported'] += 1

        except Exception as e:
            self.log(f"    ❌ Error processing file: {e}", force=True)
            file_stats['errors'] += 1
            self.global_stats['total_errors'] += 1

        return file_stats

    def scan_and_import(self):
        """Scan the folder structure and import all CSV files.

        Returns:
            bool: False when the base path is missing, True otherwise.
        """
        if not self.base_path.exists():
            print(f"❌ ERROR: Base path not found: {self.base_path}")
            return False

        print(f"\n{'='*70}")
        print(f"BULK IMPORT: ALL SEED KEYWORDS FROM KW_DB")
        print(f"{'='*70}")
        print(f"Base Path: {self.base_path}")
        if self.dry_run:
            print("Mode: DRY RUN (no database changes)")
        print(f"{'='*70}\n")

        # Scan directory structure: Industry / Sector / *.csv
        for industry_dir in sorted(self.base_path.iterdir()):
            if not industry_dir.is_dir():
                continue

            industry_name = industry_dir.name.replace('_', ' ')
            print(f"\n📁 Industry: {industry_name}")
            print(f"{'─'*70}")

            industry, _ = self.get_or_create_industry(industry_name)

            for sector_dir in sorted(industry_dir.iterdir()):
                if not sector_dir.is_dir():
                    continue

                sector_name = sector_dir.name.replace('_', ' ')
                print(f"\n  📂 Sector: {sector_name}")

                sector, _ = self.get_or_create_sector(industry, sector_name)

                csv_files = list(sector_dir.glob('*.csv'))
                if not csv_files:
                    print(f"    ⚠ No CSV files found")
                    continue

                print(f"    Found {len(csv_files)} CSV files")

                # One transaction per sector: a failure mid-sector rolls the
                # whole sector back instead of leaving it half-imported.
                with transaction.atomic():
                    for csv_file in sorted(csv_files):
                        self.global_stats['total_files'] += 1
                        print(f"\n    📄 {csv_file.name}")

                        file_stats = self.import_csv_file(csv_file, industry, sector)
                        self.file_stats.append(file_stats)

                        print(f"      ✓ {file_stats['imported']} imported, "
                              f"⊘ {file_stats['skipped_duplicate']} duplicates, "
                              f"⊘ {file_stats['skipped_invalid']} invalid")

                    # Dry run issues read-only queries only; mark the
                    # transaction for rollback as a safety net regardless.
                    if self.dry_run:
                        transaction.set_rollback(True)

        # Print summary
        print(f"\n\n{'='*70}")
        print(f"GLOBAL IMPORT SUMMARY")
        print(f"{'='*70}")
        print(f"Total CSV files: {self.global_stats['total_files']}")
        print(f"Industries created: {self.global_stats['industries_created']}")
        print(f"Sectors created: {self.global_stats['sectors_created']}")
        print(f"─────────────────────────────────────────────────────────────────────")
        print(f"Total rows processed: {self.global_stats['total_processed']}")
        print(f"✓ Total imported: {self.global_stats['total_imported']}")
        print(f"⊘ Skipped (duplicate): {self.global_stats['total_skipped_duplicate']}")
        print(f"⊘ Skipped (invalid): {self.global_stats['total_skipped_invalid']}")
        print(f"✗ Total errors: {self.global_stats['total_errors']}")
        print(f"{'='*70}\n")

        if self.dry_run:
            print("ℹ This was a DRY RUN - no data was saved to database")
            print("Remove --dry-run flag to perform actual import\n")
        else:
            print("✓ Import completed successfully!")
            print(f"✓ Check Django admin: /admin/auth/seedkeyword/\n")

        return True


def main():
    """Parse CLI arguments and run the bulk importer."""
    parser = argparse.ArgumentParser(
        description='Import all seed keywords from KW_DB folder structure',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Dry run (preview all imports)
  docker compose -f docker-compose.app.yml exec igny8_backend \\
      python3 /app/scripts/import_all_seed_keywords.py \\
      --base-path /data/app/igny8/KW_DB \\
      --dry-run --verbose

  # Actual import
  docker compose -f docker-compose.app.yml exec igny8_backend \\
      python3 /app/scripts/import_all_seed_keywords.py \\
      --base-path /data/app/igny8/KW_DB

Folder Structure Expected:
  /data/app/igny8/KW_DB/
      HealthCare_Medical/
          Physiotherapy_Rehabilitation/
              google_us_physical-therapy*.csv
              google_us_muscle-stimulator*.csv
          ...
"""
    )

    parser.add_argument('--base-path', default='/data/app/igny8/KW_DB',
                        help='Base path to KW_DB folder (default: /data/app/igny8/KW_DB)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview without saving to database')
    parser.add_argument('--verbose', action='store_true',
                        help='Show detailed progress for each keyword')

    args = parser.parse_args()

    # Create importer and run
    importer = BulkKeywordImporter(args.base_path,
                                   dry_run=args.dry_run,
                                   verbose=args.verbose)
    success = importer.scan_and_import()

    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()