Files
igny8/backend/scripts/import_us_ca_tech.py

189 lines
6.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Import US/CA Technology & IT Services Keywords
Targeted import script for the "US CA" folder in Technology_&_IT_Services.
Maps folder names to existing DB sectors and imports all CSV files.
Usage:
docker exec igny8_backend python3 /app/scripts/import_us_ca_tech.py --dry-run
docker exec igny8_backend python3 /app/scripts/import_us_ca_tech.py
"""
import os
import sys
import csv
import argparse
sys.path.insert(0, '/app')
os.chdir('/app')
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'igny8_core.settings')
import django
django.setup()
from pathlib import Path
from django.db import transaction
from igny8_core.auth.models import Industry, IndustrySector, SeedKeyword
# Hard-coded mapping: folder name in "US CA" -> sector slug in DB
FOLDER_TO_SECTOR_SLUG = {
'Automation & Workflow Systems': 'automation-workflow-systems',
'Cloud Services': 'cloud-services',
'DATA & AI Services': 'data-ai-services',
'Digital Marketing & SEO': 'digital-marketing-seo',
'SAAS': 'saas',
'Web Development & Design': 'web-development-design',
}
INDUSTRY_ID = 27 # Technology & IT Services
BASE_PATH = Path('/data/app/igny8/KW_DB/Technology_&_IT_Services/US CA')
def parse_csv_row(row):
"""Parse a CSV row into keyword data."""
keyword = row.get('Keyword', '').strip()
if not keyword:
return None
country_raw = row.get('Country', 'US').strip().upper()
if not country_raw:
country_raw = 'US'
volume_raw = row.get('Volume', '0').strip()
try:
volume = int(volume_raw) if volume_raw else 0
except (ValueError, TypeError):
volume = 0
difficulty_raw = row.get('Difficulty', '0').strip()
try:
difficulty = int(difficulty_raw) if difficulty_raw else 0
difficulty = max(0, min(100, difficulty))
except (ValueError, TypeError):
difficulty = 0
return {
'keyword': keyword,
'country': country_raw,
'volume': volume,
'difficulty': difficulty,
}
def import_csv(csv_path, industry, sector, dry_run=False):
"""Import a single CSV file, returns stats dict."""
stats = {'processed': 0, 'imported': 0, 'skipped_dup': 0, 'skipped_inv': 0, 'errors': 0}
try:
with open(csv_path, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
stats['processed'] += 1
data = parse_csv_row(row)
if not data:
stats['skipped_inv'] += 1
continue
# Duplicate check: keyword (case-insensitive) in same industry+sector
# DB unique constraint is (keyword, industry_id, sector_id) - no country
exists = SeedKeyword.objects.filter(
keyword__iexact=data['keyword'],
industry=industry,
sector=sector,
).exists()
if exists:
stats['skipped_dup'] += 1
continue
if not dry_run:
SeedKeyword.objects.create(
keyword=data['keyword'],
industry=industry,
sector=sector,
volume=data['volume'],
difficulty=data['difficulty'],
country=data['country'],
is_active=True,
)
stats['imported'] += 1
except Exception as e:
print(f" ❌ Error: {e}")
stats['errors'] += 1
return stats
def main():
parser = argparse.ArgumentParser(description='Import US/CA Tech keywords')
parser.add_argument('--dry-run', action='store_true', help='Preview without saving')
args = parser.parse_args()
industry = Industry.objects.get(id=INDUSTRY_ID)
print(f"\n{'='*70}")
print(f"IMPORT: Technology & IT Services — US CA Keywords")
print(f"{'='*70}")
print(f"Industry: {industry.name} (id={industry.id})")
print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE IMPORT'}")
print(f"{'='*70}\n")
totals = {'files': 0, 'processed': 0, 'imported': 0, 'skipped_dup': 0, 'skipped_inv': 0, 'errors': 0}
for folder_name, sector_slug in sorted(FOLDER_TO_SECTOR_SLUG.items()):
folder_path = BASE_PATH / folder_name
if not folder_path.exists():
print(f"⚠ Folder not found: {folder_name}")
continue
sector = IndustrySector.objects.get(industry=industry, slug=sector_slug)
print(f"\n📂 Sector: {sector.name} (id={sector.id}, slug={sector.slug})")
csv_files = sorted(folder_path.glob('*.csv'))
if not csv_files:
print(f" ⚠ No CSV files")
continue
print(f" Found {len(csv_files)} CSV files")
with transaction.atomic():
for csv_file in csv_files:
totals['files'] += 1
stats = import_csv(csv_file, industry, sector, dry_run=args.dry_run)
totals['processed'] += stats['processed']
totals['imported'] += stats['imported']
totals['skipped_dup'] += stats['skipped_dup']
totals['skipped_inv'] += stats['skipped_inv']
totals['errors'] += stats['errors']
print(f" 📄 {csv_file.name}")
print(f" rows={stats['processed']} | ✓ imported={stats['imported']} | ⊘ dup={stats['skipped_dup']} | inv={stats['skipped_inv']}")
if args.dry_run:
transaction.set_rollback(True)
print(f"\n\n{'='*70}")
print(f"SUMMARY")
print(f"{'='*70}")
print(f"Total CSV files: {totals['files']}")
print(f"Total rows processed: {totals['processed']}")
print(f"✓ Imported: {totals['imported']}")
print(f"⊘ Skipped (duplicate): {totals['skipped_dup']}")
print(f"⊘ Skipped (invalid): {totals['skipped_inv']}")
print(f"✗ Errors: {totals['errors']}")
print(f"{'='*70}\n")
if args.dry_run:
print(" DRY RUN — no data saved. Remove --dry-run to import.\n")
else:
print("✓ Import completed!\n")
if __name__ == '__main__':
main()