Files
igny8/KW_DB/management/import_single_csv.py
2026-01-13 12:00:16 +00:00

302 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
"""
Import Seed Keywords from Single CSV File
This script imports keywords from a single CSV file into the IGNY8 global keywords database.
Use this for testing before running full import.
Usage:
cd /data/app/igny8/backend
python ../KW_DB/management/import_single_csv.py \\
--csv /data/app/igny8/KW_DB/HealthCare_Medical/Physiotherapy_Rehabilitation/google_us_physical-therapy_matching-terms_2025-12-19_04-25-15.csv \\
--industry "HealthCare Medical" \\
--sector "Physiotherapy Rehabilitation" \\
--dry-run
Author: IGNY8 Team
Date: January 13, 2026
"""
import os
import sys
import csv
import argparse
from pathlib import Path
# --- Django bootstrap -------------------------------------------------------
# This script lives outside the Django project, so the backend directory has
# to be put on sys.path by hand before any project module can be imported.
# Use /app when running in Docker (detected by the presence of
# /app/manage.py), /data/app/igny8/backend when running locally.
backend_path = '/app' if os.path.exists('/app/manage.py') else '/data/app/igny8/backend'
# Insert at position 0 so the backend directory is searched first.
sys.path.insert(0, backend_path)
# setdefault: a DJANGO_SETTINGS_MODULE already present in the environment is
# kept; 'igny8_core.settings' is only the fallback.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'igny8_core.settings')
import django
# Configure Django before the imports below. NOTE(review): the model import
# is deliberately placed after django.setup(); moving it above this call is
# expected to fail because the app registry would not be ready yet.
django.setup()
from django.utils.text import slugify
from django.db import transaction
from igny8_core.auth.models import Industry, IndustrySector, SeedKeyword
class KeywordImporter:
    """Import seed keywords from a CSV file into the database.

    One importer instance accumulates counters in ``self.stats`` across the
    rows it processes:

    * ``processed``         - rows read from the CSV
    * ``imported``          - rows created (or that would be created in a dry run)
    * ``skipped_duplicate`` - rows already present for the same
      keyword/country/industry/sector
    * ``skipped_invalid``   - rows without a usable keyword
    * ``errors``            - fatal errors that aborted the import

    In dry-run mode no database query is issued: industry/sector lookups
    return lightweight stand-in objects and duplicate detection is skipped.
    """

    def __init__(self, dry_run=False, verbose=False):
        """Create an importer.

        Args:
            dry_run: When true, only report what would be done.
            verbose: When true, print per-row progress messages.
        """
        self.dry_run = dry_run
        self.verbose = verbose
        self.stats = {
            'processed': 0,
            'imported': 0,
            'skipped_duplicate': 0,
            'skipped_invalid': 0,
            'errors': 0,
        }

    def log(self, message, force=False):
        """Print ``message`` if verbose mode is on or ``force`` is true."""
        if self.verbose or force:
            print(message)

    @staticmethod
    def _mock_record(name, slug):
        """Return a stand-in object exposing ``id``, ``name`` and ``slug``.

        A nested ``class Mock: name = name`` does not work here: a name that
        is assigned inside a class body is not looked up in the enclosing
        function scope, so that form raises NameError. SimpleNamespace
        captures the values directly.
        """
        from types import SimpleNamespace

        return SimpleNamespace(id=0, name=name, slug=slug)

    @staticmethod
    def _cell(row, column, default=''):
        """Return the stripped text of ``row[column]``.

        ``csv.DictReader`` fills the missing trailing fields of a short row
        with ``None``; those are treated as empty strings so one missing
        optional column does not invalidate the whole row.
        """
        value = row.get(column, default)
        if value is None:
            return ''
        return str(value).strip()

    @staticmethod
    def _to_int(text, default=0):
        """Parse ``text`` as an int, returning ``default`` when empty or invalid."""
        if not text:
            return default
        try:
            return int(text)
        except ValueError:
            return default

    def get_or_create_industry(self, name):
        """Get or create the Industry whose slug matches ``slugify(name)``.

        Returns:
            ``(industry, created)``. In dry-run mode ``industry`` is a
            stand-in object and ``created`` is always False.
        """
        slug = slugify(name)
        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Industry: {name} (slug: {slug})")
            return self._mock_record(name, slug), False

        industry, created = Industry.objects.get_or_create(
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB',
            },
        )
        if created:
            self.log(f"✓ Created Industry: {name}", force=True)
        else:
            self.log(f"✓ Found existing Industry: {name}")
        return industry, created

    def get_or_create_sector(self, industry, name):
        """Get or create the IndustrySector ``name`` under ``industry``.

        Returns:
            ``(sector, created)``. In dry-run mode ``sector`` is a stand-in
            object and ``created`` is always False.
        """
        slug = slugify(name)
        if self.dry_run:
            self.log(f"[DRY RUN] Would get/create Sector: {name} (slug: {slug})")
            return self._mock_record(name, slug), False

        sector, created = IndustrySector.objects.get_or_create(
            industry=industry,
            slug=slug,
            defaults={
                'name': name,
                'is_active': True,
                'description': 'Auto-imported from KW_DB',
            },
        )
        if created:
            self.log(f" ✓ Created Sector: {name}", force=True)
        else:
            self.log(f" ✓ Found existing Sector: {name}")
        return sector, created

    def is_duplicate(self, keyword, country, industry, sector):
        """Return True if the keyword already exists for this country.

        The match is keyword (case-insensitive) + country within the given
        industry and sector. Always False in dry-run mode, where the
        database is not queried.
        """
        if self.dry_run:
            return False  # Skip duplicate check in dry run
        return SeedKeyword.objects.filter(
            keyword__iexact=keyword,
            country=country,
            industry=industry,
            sector=sector,
        ).exists()

    def import_keyword(self, keyword_data, industry, sector):
        """Import a single parsed keyword record.

        Args:
            keyword_data: Dict with ``keyword``, ``country``, ``volume`` and
                ``difficulty`` keys, as produced by ``parse_csv_row``.
            industry: Industry the keyword belongs to.
            sector: IndustrySector the keyword belongs to.

        Returns:
            True if the keyword was created (or would be, in a dry run);
            False if it was skipped as a duplicate.
        """
        keyword = keyword_data['keyword']
        country = keyword_data['country']
        volume = keyword_data['volume']
        difficulty = keyword_data['difficulty']

        # Check for duplicate (keyword + country)
        if self.is_duplicate(keyword, country, industry, sector):
            self.log(f" ⊘ SKIP (duplicate): {keyword} [{country}]")
            self.stats['skipped_duplicate'] += 1
            return False

        if self.dry_run:
            self.log(f" [DRY RUN] Would import: {keyword} [{country}] (vol:{volume}, diff:{difficulty})")
            return True

        SeedKeyword.objects.create(
            keyword=keyword,
            industry=industry,
            sector=sector,
            volume=volume,
            difficulty=difficulty,
            country=country,
            is_active=True,
        )
        self.log(f" ✓ Imported: {keyword} [{country}] (vol:{volume}, diff:{difficulty})")
        return True

    def parse_csv_row(self, row):
        """Extract keyword data from one CSV row.

        Reads the ``Keyword``, ``Country``, ``Volume`` and ``Difficulty``
        columns. Country defaults to ``US`` and is upper-cased; volume and
        difficulty default to 0 when missing or not an integer; difficulty
        is clamped to 0-100.

        Returns:
            A dict with ``keyword``, ``country``, ``volume`` and
            ``difficulty``, or None when the row has no keyword or is not a
            mapping.
        """
        try:
            keyword = self._cell(row, 'Keyword')
            if not keyword:
                return None

            # Parse country (default to US)
            country = self._cell(row, 'Country', 'us').upper() or 'US'
            # Parse volume (default to 0)
            volume = self._to_int(self._cell(row, 'Volume', '0'))
            # Parse difficulty (default to 0, clamp to 0-100)
            difficulty = self._to_int(self._cell(row, 'Difficulty', '0'))
            difficulty = max(0, min(100, difficulty))
        except (AttributeError, TypeError) as e:
            # Row is not a mapping; report it and let the caller count the
            # row as invalid instead of aborting the whole import.
            self.log(f" ⚠ Error parsing row: {e}")
            return None

        return {
            'keyword': keyword,
            'country': country,
            'volume': volume,
            'difficulty': difficulty,
        }

    def import_csv(self, csv_path, industry_name, sector_name):
        """Import every keyword from ``csv_path`` under the given industry/sector.

        All rows are written inside a single transaction, so an exception
        while processing rolls the keyword inserts back. Prints a summary of
        ``self.stats`` on success.

        Returns:
            True on success, False if the file is missing or an error
            aborted the import.
        """
        csv_path = Path(csv_path)
        if not csv_path.exists():
            print(f"❌ ERROR: CSV file not found: {csv_path}")
            return False

        rule = '=' * 60
        print(f"\n{rule}")
        print(f"IMPORTING FROM: {csv_path.name}")
        print(f"Industry: {industry_name}")
        print(f"Sector: {sector_name}")
        if self.dry_run:
            print("MODE: DRY RUN (no database changes)")
        print(f"{rule}\n")

        # Get or create Industry and Sector
        industry, _ = self.get_or_create_industry(industry_name)
        sector, _ = self.get_or_create_sector(industry, sector_name)

        # Read and import CSV
        print("\nProcessing keywords...")
        try:
            with transaction.atomic():
                # utf-8-sig transparently drops a BOM (plain UTF-8 decodes
                # identically); newline='' is what the csv module requires
                # to handle newlines embedded in quoted fields.
                with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
                    for row in csv.DictReader(f):
                        self.stats['processed'] += 1
                        keyword_data = self.parse_csv_row(row)
                        if not keyword_data:
                            self.stats['skipped_invalid'] += 1
                            continue
                        if self.import_keyword(keyword_data, industry, sector):
                            self.stats['imported'] += 1
                # Rollback in dry run mode
                if self.dry_run:
                    transaction.set_rollback(True)
        except Exception as e:
            # Top-level boundary: report the failure and signal it through
            # the return value instead of a traceback.
            print(f"\n❌ ERROR: {e}")
            self.stats['errors'] += 1
            return False

        # Print summary
        print(f"\n{rule}")
        print("IMPORT SUMMARY")
        print(rule)
        print(f"Total rows processed: {self.stats['processed']}")
        print(f"✓ Imported: {self.stats['imported']}")
        print(f"⊘ Skipped (dup): {self.stats['skipped_duplicate']}")
        print(f"⊘ Skipped (invalid): {self.stats['skipped_invalid']}")
        print(f"✗ Errors: {self.stats['errors']}")
        print(f"{rule}\n")
        if self.dry_run:
            print(" This was a DRY RUN - no data was saved to database")
            print("Remove --dry-run flag to perform actual import\n")
        else:
            print("✓ Import completed successfully!\n")
        return True
def main(argv=None):
    """Command-line entry point: parse arguments, run the import, and exit.

    Args:
        argv: Optional list of argument strings. Defaults to None, in which
            case argparse reads ``sys.argv[1:]``; passing a list allows the
            CLI to be driven programmatically.

    Raises:
        SystemExit: Always. Exit code 0 when the import succeeded, 1 when it
            failed; argparse itself exits with 2 on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description='Import seed keywords from single CSV file',
        # Keep the epilog's line breaks exactly as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "# Dry run (preview only)\n"
            'python import_single_csv.py --csv /path/to/file.csv --industry "HealthCare Medical" --sector "Physiotherapy Rehabilitation" --dry-run\n'
            "# Actual import with verbose output\n"
            'python import_single_csv.py --csv /path/to/file.csv --industry "HealthCare Medical" --sector "Physiotherapy Rehabilitation" --verbose\n'
        ),
    )
    parser.add_argument('--csv', required=True, help='Path to CSV file')
    parser.add_argument('--industry', required=True, help='Industry name')
    parser.add_argument('--sector', required=True, help='Sector name')
    parser.add_argument('--dry-run', action='store_true', help='Preview without saving to database')
    parser.add_argument('--verbose', action='store_true', help='Show detailed progress')
    args = parser.parse_args(argv)

    # Create importer and run
    importer = KeywordImporter(dry_run=args.dry_run, verbose=args.verbose)
    success = importer.import_csv(args.csv, args.industry, args.sector)
    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()