#!/usr/bin/env python3
"""
Import All Seed Keywords from KW_DB Folder Structure

This script automatically scans the KW_DB folder structure and imports all CSV files.
It extracts Industry and Sector from folder names and imports keywords with duplicate checking.

FOLDER STRUCTURE EXPECTED:
    /data/app/igny8/KW_DB/
        {Industry}/
            {Sector}/
                *.csv files

DUPLICATE HANDLING:
- Checks: keyword + country (case-insensitive)
- If duplicate exists in same industry+sector: SKIPS import

Usage:
    # Dry run (preview all imports)
    docker compose -f docker-compose.app.yml exec igny8_backend \\
        python3 /app/scripts/import_all_seed_keywords.py \\
        --base-path /data/app/igny8/KW_DB \\
        --dry-run --verbose

    # Actual import
    docker compose -f docker-compose.app.yml exec igny8_backend \\
        python3 /app/scripts/import_all_seed_keywords.py \\
        --base-path /data/app/igny8/KW_DB

Author: IGNY8 Team
Date: January 13, 2026
"""

import os
import sys
import csv
import argparse
import django
from pathlib import Path

# Change to app directory for Django imports.
# NOTE: chdir is a process-wide side effect; relative paths elsewhere in this
# script are resolved against /app after this point.
sys.path.insert(0, '/app')
os.chdir('/app')

# Setup Django. DJANGO_SETTINGS_MODULE must be set and django.setup() must run
# BEFORE any model imports below — do not reorder these statements.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'igny8_core.settings')
django.setup()

from django.utils.text import slugify
from django.db import transaction
from igny8_core.auth.models import Industry, IndustrySector, SeedKeyword
class BulkKeywordImporter:
    """Scan a KW_DB folder tree (Industry/Sector/*.csv) and import seed keywords.

    Duplicate rule: keyword + country (case-insensitive) within the same
    industry + sector. In dry-run mode nothing is written to the database
    and the per-sector transaction is rolled back as a safety net.
    """

    def __init__(self, base_path, dry_run=False, verbose=False):
        self.base_path = Path(base_path)
        self.dry_run = dry_run
        self.verbose = verbose
        # Aggregate counters across every file in the tree.
        self.global_stats = {
            'total_files': 0,
            'total_processed': 0,
            'total_imported': 0,
            'total_skipped_duplicate': 0,
            'total_skipped_invalid': 0,
            'total_errors': 0,
            'industries_created': 0,
            'sectors_created': 0
        }
        # Per-file stat dicts, appended in processing order.
        self.file_stats = []

    def log(self, message, force=False):
        """Print message if verbose or forced"""
        if self.verbose or force:
            print(message)

    def get_or_create_industry(self, name):
        """Get or create an Industry record; returns (industry, created).

        In dry-run mode a lightweight mock (id=0) is returned instead of
        touching the database.
        """
        slug = slugify(name)

        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Industry: {name} (slug: {slug})")

            class MockIndustry:
                def __init__(self):
                    self.id = 0
                    self.name = name
                    self.slug = slug
            return MockIndustry(), False

        industry, created = Industry.objects.get_or_create(
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                # Plain string: the original f-string had no placeholders.
                'description': 'Auto-imported from KW_DB'
            }
        )

        if created:
            self.log(f"✓ Created Industry: {name}", force=True)
            self.global_stats['industries_created'] += 1
        else:
            self.log(f"✓ Found existing Industry: {name}")

        return industry, created

    def get_or_create_sector(self, industry, name):
        """Get or create an IndustrySector under *industry*; returns (sector, created).

        Dry-run returns a mock (id=0) and never touches the database.
        """
        slug = slugify(name)

        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Sector: {name} (slug: {slug})")

            class MockSector:
                def __init__(self):
                    self.id = 0
                    self.name = name
                    self.slug = slug
            return MockSector(), False

        sector, created = IndustrySector.objects.get_or_create(
            industry=industry,
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB'
            }
        )

        if created:
            self.log(f"  ✓ Created Sector: {name}", force=True)
            self.global_stats['sectors_created'] += 1
        else:
            self.log(f"  ✓ Found existing Sector: {name}")

        return sector, created

    def is_duplicate(self, keyword, country, industry, sector):
        """
        Check if keyword already exists with same country in this industry+sector.
        Duplicate check: keyword + country (case-insensitive)
        """
        if self.dry_run:
            return False  # Skip duplicate check in dry run

        exists = SeedKeyword.objects.filter(
            keyword__iexact=keyword,
            country=country,
            industry=industry,
            sector=sector
        ).exists()

        return exists

    def parse_csv_row(self, row):
        """Parse a CSV row into {'keyword','country','volume','difficulty'}.

        Returns None for rows without a usable keyword or that fail to parse.
        DictReader pads short rows with None, so every field is coerced with
        ``or`` before .strip() — previously a None value raised inside the
        broad except and silently dropped the row.
        """
        try:
            keyword = (row.get('Keyword') or '').strip()
            if not keyword:
                return None

            # Parse country (default to US)
            country_raw = (row.get('Country') or 'us').strip().upper()
            if not country_raw:
                country_raw = 'US'

            # Parse volume (default to 0)
            volume_raw = (row.get('Volume') or '0').strip()
            try:
                volume = int(volume_raw) if volume_raw else 0
            except (ValueError, TypeError):
                volume = 0

            # Parse difficulty (default to 0, clamp to 0-100)
            difficulty_raw = (row.get('Difficulty') or '0').strip()
            try:
                difficulty = int(difficulty_raw) if difficulty_raw else 0
                difficulty = max(0, min(100, difficulty))  # Clamp to 0-100
            except (ValueError, TypeError):
                difficulty = 0

            return {
                'keyword': keyword,
                'country': country_raw,
                'volume': volume,
                'difficulty': difficulty
            }

        except Exception as e:
            self.log(f"    ⚠ Error parsing row: {e}")
            return None

    def import_csv_file(self, csv_path, industry, sector):
        """Import keywords from a single CSV file; returns the per-file stats dict."""
        file_stats = {
            'file': csv_path.name,
            'processed': 0,
            'imported': 0,
            'skipped_duplicate': 0,
            'skipped_invalid': 0,
            'errors': 0
        }

        try:
            # utf-8-sig: keyword-tool exports frequently carry a UTF-8 BOM,
            # which would otherwise corrupt the first header ('Keyword').
            with open(csv_path, 'r', encoding='utf-8-sig') as f:
                reader = csv.DictReader(f)

                for row in reader:
                    file_stats['processed'] += 1
                    self.global_stats['total_processed'] += 1

                    keyword_data = self.parse_csv_row(row)
                    if not keyword_data:
                        file_stats['skipped_invalid'] += 1
                        self.global_stats['total_skipped_invalid'] += 1
                        continue

                    keyword = keyword_data['keyword']
                    country = keyword_data['country']
                    volume = keyword_data['volume']
                    difficulty = keyword_data['difficulty']

                    # Check for duplicate (keyword + country)
                    if self.is_duplicate(keyword, country, industry, sector):
                        self.log(f"    ⊘ SKIP (duplicate): {keyword} [{country}]")
                        file_stats['skipped_duplicate'] += 1
                        self.global_stats['total_skipped_duplicate'] += 1
                        continue

                    if self.dry_run:
                        self.log(f"    [DRY RUN] Would import: {keyword} [{country}] (vol:{volume}, diff:{difficulty})")
                    else:
                        # Create keyword
                        SeedKeyword.objects.create(
                            keyword=keyword,
                            industry=industry,
                            sector=sector,
                            volume=volume,
                            difficulty=difficulty,
                            country=country,
                            is_active=True
                        )
                        self.log(f"    ✓ Imported: {keyword} [{country}]")

                    # Dry-run rows count as "imported" so the preview totals match.
                    file_stats['imported'] += 1
                    self.global_stats['total_imported'] += 1

        except Exception as e:
            self.log(f"    ❌ Error processing file: {e}", force=True)
            file_stats['errors'] += 1
            self.global_stats['total_errors'] += 1

        return file_stats

    def scan_and_import(self):
        """Walk base_path (Industry/Sector/*.csv), import everything, print a summary.

        Returns False if base_path is missing, True otherwise.
        """
        if not self.base_path.exists():
            print(f"❌ ERROR: Base path not found: {self.base_path}")
            return False

        print(f"\n{'='*70}")
        print("BULK IMPORT: ALL SEED KEYWORDS FROM KW_DB")
        print(f"{'='*70}")
        print(f"Base Path: {self.base_path}")
        if self.dry_run:
            print("Mode: DRY RUN (no database changes)")
        print(f"{'='*70}\n")

        # Scan directory structure: Industry / Sector / *.csv
        for industry_dir in sorted(self.base_path.iterdir()):
            if not industry_dir.is_dir():
                continue

            # Folder names use underscores where display names use spaces.
            industry_name = industry_dir.name.replace('_', ' ')

            print(f"\n📁 Industry: {industry_name}")
            print(f"{'─'*70}")

            industry, _ = self.get_or_create_industry(industry_name)

            for sector_dir in sorted(industry_dir.iterdir()):
                if not sector_dir.is_dir():
                    continue

                sector_name = sector_dir.name.replace('_', ' ')

                print(f"\n  📂 Sector: {sector_name}")

                sector, _ = self.get_or_create_sector(industry, sector_name)

                # Find all CSV files
                csv_files = list(sector_dir.glob('*.csv'))

                if not csv_files:
                    print("    ⚠ No CSV files found")
                    continue

                print(f"    Found {len(csv_files)} CSV files")

                # One transaction per sector; all of its files commit together.
                with transaction.atomic():
                    for csv_file in sorted(csv_files):
                        self.global_stats['total_files'] += 1
                        print(f"\n    📄 {csv_file.name}")

                        file_stats = self.import_csv_file(csv_file, industry, sector)
                        self.file_stats.append(file_stats)

                        print(f"    ✓ {file_stats['imported']} imported, "
                              f"⊘ {file_stats['skipped_duplicate']} duplicates, "
                              f"⊘ {file_stats['skipped_invalid']} invalid")

                    # Rollback in dry run mode (belt-and-braces: dry run also
                    # skips the create() calls above).
                    if self.dry_run:
                        transaction.set_rollback(True)

        # Print summary
        print(f"\n\n{'='*70}")
        print("GLOBAL IMPORT SUMMARY")
        print(f"{'='*70}")
        print(f"Total CSV files: {self.global_stats['total_files']}")
        print(f"Industries created: {self.global_stats['industries_created']}")
        print(f"Sectors created: {self.global_stats['sectors_created']}")
        print("─────────────────────────────────────────────────────────────────────")
        print(f"Total rows processed: {self.global_stats['total_processed']}")
        print(f"✓ Total imported: {self.global_stats['total_imported']}")
        print(f"⊘ Skipped (duplicate): {self.global_stats['total_skipped_duplicate']}")
        print(f"⊘ Skipped (invalid): {self.global_stats['total_skipped_invalid']}")
        print(f"✗ Total errors: {self.global_stats['total_errors']}")
        print(f"{'='*70}\n")

        if self.dry_run:
            print("ℹ This was a DRY RUN - no data was saved to database")
            print("Remove --dry-run flag to perform actual import\n")
        else:
            print("✓ Import completed successfully!")
            print("✓ Check Django admin: /admin/auth/seedkeyword/\n")

        return True
def main():
    """CLI entry point: parse arguments, run the bulk importer, exit 0/1."""
    arg_parser = argparse.ArgumentParser(
        description='Import all seed keywords from KW_DB folder structure',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Dry run (preview all imports)
  docker compose -f docker-compose.app.yml exec igny8_backend \\
      python3 /app/scripts/import_all_seed_keywords.py \\
      --base-path /data/app/igny8/KW_DB \\
      --dry-run --verbose

  # Actual import
  docker compose -f docker-compose.app.yml exec igny8_backend \\
      python3 /app/scripts/import_all_seed_keywords.py \\
      --base-path /data/app/igny8/KW_DB

Folder Structure Expected:
  /data/app/igny8/KW_DB/
      HealthCare_Medical/
          Physiotherapy_Rehabilitation/
              google_us_physical-therapy*.csv
              google_us_muscle-stimulator*.csv
              ...
"""
    )

    arg_parser.add_argument(
        '--base-path',
        default='/data/app/igny8/KW_DB',
        help='Base path to KW_DB folder (default: /data/app/igny8/KW_DB)',
    )
    arg_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview without saving to database',
    )
    arg_parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed progress for each keyword',
    )

    options = arg_parser.parse_args()

    # Create importer and run; exit status mirrors the scan result.
    runner = BulkKeywordImporter(
        options.base_path,
        dry_run=options.dry_run,
        verbose=options.verbose,
    )
    ok = runner.scan_and_import()

    sys.exit(0 if ok else 1)
# Standard script entry-point guard: run only when executed directly.
if __name__ == '__main__':
    main()