Files
igny8/backend/scripts/import_all_seed_keywords.py
2026-01-13 12:00:16 +00:00

389 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Import All Seed Keywords from KW_DB Folder Structure
This script automatically scans the KW_DB folder structure and imports all CSV files.
It extracts Industry and Sector from folder names and imports keywords with duplicate checking.
FOLDER STRUCTURE EXPECTED:
/data/app/igny8/KW_DB/
{Industry}/
{Sector}/
*.csv files
DUPLICATE HANDLING:
- Checks: keyword + country (case-insensitive)
- If duplicate exists in same industry+sector: SKIPS import
Usage:
# Dry run (preview all imports)
docker compose -f docker-compose.app.yml exec igny8_backend \\
python3 /app/scripts/import_all_seed_keywords.py \\
--base-path /data/app/igny8/KW_DB \\
--dry-run --verbose
# Actual import
docker compose -f docker-compose.app.yml exec igny8_backend \\
python3 /app/scripts/import_all_seed_keywords.py \\
--base-path /data/app/igny8/KW_DB
Author: IGNY8 Team
Date: January 13, 2026
"""
import os
import sys
import csv
import argparse
import django
from pathlib import Path
# Change to app directory for Django imports
sys.path.insert(0, '/app')
os.chdir('/app')
# Setup Django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'igny8_core.settings')
django.setup()
from django.utils.text import slugify
from django.db import transaction
from igny8_core.auth.models import Industry, IndustrySector, SeedKeyword
class BulkKeywordImporter:
"""Import keywords from entire folder structure"""
def __init__(self, base_path, dry_run=False, verbose=False):
self.base_path = Path(base_path)
self.dry_run = dry_run
self.verbose = verbose
self.global_stats = {
'total_files': 0,
'total_processed': 0,
'total_imported': 0,
'total_skipped_duplicate': 0,
'total_skipped_invalid': 0,
'total_errors': 0,
'industries_created': 0,
'sectors_created': 0
}
self.file_stats = []
def log(self, message, force=False):
"""Print message if verbose or forced"""
if self.verbose or force:
print(message)
def get_or_create_industry(self, name):
"""Get or create Industry record"""
slug = slugify(name)
if self.dry_run:
self.log(f"[DRY RUN] Would get/create Industry: {name} (slug: {slug})")
class MockIndustry:
def __init__(self):
self.id = 0
self.name = name
self.slug = slug
return MockIndustry(), False
industry, created = Industry.objects.get_or_create(
slug=slug,
defaults={
'name': name,
'is_active': True,
'description': f'Auto-imported from KW_DB'
}
)
if created:
self.log(f"✓ Created Industry: {name}", force=True)
self.global_stats['industries_created'] += 1
else:
self.log(f"✓ Found existing Industry: {name}")
return industry, created
def get_or_create_sector(self, industry, name):
"""Get or create IndustrySector record"""
slug = slugify(name)
if self.dry_run:
self.log(f"[DRY RUN] Would get/create Sector: {name} (slug: {slug})")
class MockSector:
def __init__(self):
self.id = 0
self.name = name
self.slug = slug
return MockSector(), False
sector, created = IndustrySector.objects.get_or_create(
industry=industry,
slug=slug,
defaults={
'name': name,
'is_active': True,
'description': f'Auto-imported from KW_DB'
}
)
if created:
self.log(f" ✓ Created Sector: {name}", force=True)
self.global_stats['sectors_created'] += 1
else:
self.log(f" ✓ Found existing Sector: {name}")
return sector, created
def is_duplicate(self, keyword, country, industry, sector):
"""
Check if keyword already exists with same country in this industry+sector.
Duplicate check: keyword + country (case-insensitive)
"""
if self.dry_run:
return False # Skip duplicate check in dry run
exists = SeedKeyword.objects.filter(
keyword__iexact=keyword,
country=country,
industry=industry,
sector=sector
).exists()
return exists
def parse_csv_row(self, row):
"""Parse CSV row and extract keyword data"""
try:
keyword = row.get('Keyword', '').strip()
if not keyword:
return None
# Parse country (default to US)
country_raw = row.get('Country', 'us').strip().upper()
if not country_raw:
country_raw = 'US'
# Parse volume (default to 0)
volume_raw = row.get('Volume', '0').strip()
try:
volume = int(volume_raw) if volume_raw else 0
except (ValueError, TypeError):
volume = 0
# Parse difficulty (default to 0, clamp to 0-100)
difficulty_raw = row.get('Difficulty', '0').strip()
try:
difficulty = int(difficulty_raw) if difficulty_raw else 0
difficulty = max(0, min(100, difficulty)) # Clamp to 0-100
except (ValueError, TypeError):
difficulty = 0
return {
'keyword': keyword,
'country': country_raw,
'volume': volume,
'difficulty': difficulty
}
except Exception as e:
self.log(f" ⚠ Error parsing row: {e}")
return None
def import_csv_file(self, csv_path, industry, sector):
"""Import keywords from single CSV file"""
file_stats = {
'file': csv_path.name,
'processed': 0,
'imported': 0,
'skipped_duplicate': 0,
'skipped_invalid': 0,
'errors': 0
}
try:
with open(csv_path, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
file_stats['processed'] += 1
self.global_stats['total_processed'] += 1
keyword_data = self.parse_csv_row(row)
if not keyword_data:
file_stats['skipped_invalid'] += 1
self.global_stats['total_skipped_invalid'] += 1
continue
keyword = keyword_data['keyword']
country = keyword_data['country']
volume = keyword_data['volume']
difficulty = keyword_data['difficulty']
# Check for duplicate (keyword + country)
if self.is_duplicate(keyword, country, industry, sector):
self.log(f" ⊘ SKIP (duplicate): {keyword} [{country}]")
file_stats['skipped_duplicate'] += 1
self.global_stats['total_skipped_duplicate'] += 1
continue
if self.dry_run:
self.log(f" [DRY RUN] Would import: {keyword} [{country}] (vol:{volume}, diff:{difficulty})")
else:
# Create keyword
SeedKeyword.objects.create(
keyword=keyword,
industry=industry,
sector=sector,
volume=volume,
difficulty=difficulty,
country=country,
is_active=True
)
self.log(f" ✓ Imported: {keyword} [{country}]")
file_stats['imported'] += 1
self.global_stats['total_imported'] += 1
except Exception as e:
self.log(f" ❌ Error processing file: {e}", force=True)
file_stats['errors'] += 1
self.global_stats['total_errors'] += 1
return file_stats
def scan_and_import(self):
"""Scan folder structure and import all CSV files"""
if not self.base_path.exists():
print(f"❌ ERROR: Base path not found: {self.base_path}")
return False
print(f"\n{'='*70}")
print(f"BULK IMPORT: ALL SEED KEYWORDS FROM KW_DB")
print(f"{'='*70}")
print(f"Base Path: {self.base_path}")
if self.dry_run:
print("Mode: DRY RUN (no database changes)")
print(f"{'='*70}\n")
# Scan directory structure: Industry / Sector / *.csv
for industry_dir in sorted(self.base_path.iterdir()):
if not industry_dir.is_dir():
continue
industry_name = industry_dir.name.replace('_', ' ')
print(f"\n📁 Industry: {industry_name}")
print(f"{''*70}")
industry, _ = self.get_or_create_industry(industry_name)
for sector_dir in sorted(industry_dir.iterdir()):
if not sector_dir.is_dir():
continue
sector_name = sector_dir.name.replace('_', ' ')
print(f"\n 📂 Sector: {sector_name}")
sector, _ = self.get_or_create_sector(industry, sector_name)
# Find all CSV files
csv_files = list(sector_dir.glob('*.csv'))
if not csv_files:
print(f" ⚠ No CSV files found")
continue
print(f" Found {len(csv_files)} CSV files")
# Import each CSV file
with transaction.atomic():
for csv_file in sorted(csv_files):
self.global_stats['total_files'] += 1
print(f"\n 📄 {csv_file.name}")
file_stats = self.import_csv_file(csv_file, industry, sector)
self.file_stats.append(file_stats)
print(f"{file_stats['imported']} imported, "
f"{file_stats['skipped_duplicate']} duplicates, "
f"{file_stats['skipped_invalid']} invalid")
# Rollback in dry run mode
if self.dry_run:
transaction.set_rollback(True)
# Print summary
print(f"\n\n{'='*70}")
print(f"GLOBAL IMPORT SUMMARY")
print(f"{'='*70}")
print(f"Total CSV files: {self.global_stats['total_files']}")
print(f"Industries created: {self.global_stats['industries_created']}")
print(f"Sectors created: {self.global_stats['sectors_created']}")
print(f"─────────────────────────────────────────────────────────────────────")
print(f"Total rows processed: {self.global_stats['total_processed']}")
print(f"✓ Total imported: {self.global_stats['total_imported']}")
print(f"⊘ Skipped (duplicate): {self.global_stats['total_skipped_duplicate']}")
print(f"⊘ Skipped (invalid): {self.global_stats['total_skipped_invalid']}")
print(f"✗ Total errors: {self.global_stats['total_errors']}")
print(f"{'='*70}\n")
if self.dry_run:
print(" This was a DRY RUN - no data was saved to database")
print("Remove --dry-run flag to perform actual import\n")
else:
print("✓ Import completed successfully!")
print(f"✓ Check Django admin: /admin/auth/seedkeyword/\n")
return True
def main():
parser = argparse.ArgumentParser(
description='Import all seed keywords from KW_DB folder structure',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Dry run (preview all imports)
docker compose -f docker-compose.app.yml exec igny8_backend \\
python3 /app/scripts/import_all_seed_keywords.py \\
--base-path /data/app/igny8/KW_DB \\
--dry-run --verbose
# Actual import
docker compose -f docker-compose.app.yml exec igny8_backend \\
python3 /app/scripts/import_all_seed_keywords.py \\
--base-path /data/app/igny8/KW_DB
Folder Structure Expected:
/data/app/igny8/KW_DB/
HealthCare_Medical/
Physiotherapy_Rehabilitation/
google_us_physical-therapy*.csv
google_us_muscle-stimulator*.csv
...
"""
)
parser.add_argument('--base-path', default='/data/app/igny8/KW_DB',
help='Base path to KW_DB folder (default: /data/app/igny8/KW_DB)')
parser.add_argument('--dry-run', action='store_true',
help='Preview without saving to database')
parser.add_argument('--verbose', action='store_true',
help='Show detailed progress for each keyword')
args = parser.parse_args()
# Create importer and run
importer = BulkKeywordImporter(args.base_path, dry_run=args.dry_run, verbose=args.verbose)
success = importer.scan_and_import()
sys.exit(0 if success else 1)
if __name__ == '__main__':
main()