Implement Stage 3: Enhance content metadata and validation features

- Added entity metadata fields to the Tasks model, including entity_type, taxonomy, and cluster_role.
- Updated CandidateEngine to prioritize content relevance based on cluster mappings.
- Introduced metadata completeness scoring in ContentAnalyzer.
- Enhanced validation services to check for entity type and mapping completeness.
- Updated frontend components to display and validate new metadata fields.
- Implemented API endpoints for content validation and metadata persistence.
- Migrated existing data to populate new metadata fields for Tasks and Content.
2025-11-19 19:21:30 +00:00
parent 38f6026e73
commit bae9ea47d8
33 changed files with 2388 additions and 73 deletions
--- a/backend/igny8_core/modules/writer/management/__init__.py
+++ b/backend/igny8_core/modules/writer/management/__init__.py
@@ -0,0 +1,2 @@
+# Writer management commands
+
--- a/backend/igny8_core/modules/writer/management/commands/__init__.py
+++ b/backend/igny8_core/modules/writer/management/commands/__init__.py
@@ -0,0 +1,2 @@
+# Writer management commands
+
--- a/backend/igny8_core/modules/writer/management/commands/audit_site_metadata.py
+++ b/backend/igny8_core/modules/writer/management/commands/audit_site_metadata.py
@@ -0,0 +1,114 @@
+"""
+Management command to audit site metadata gaps
+Stage 3: Summarizes metadata completeness per site
+
+Usage: python manage.py audit_site_metadata [--site {id}] [--detailed]
+"""
+from django.core.management.base import BaseCommand
+from django.db.models import Count, Q
+from igny8_core.auth.models import Site
+from igny8_core.business.content.models import (
+    Tasks,
+    Content,
+    ContentClusterMap,
+    ContentTaxonomyMap,
+    ContentAttributeMap,
+)
+
+
+class Command(BaseCommand):
+    """Print a per-site report of how completely tasks and content are tagged.
+
+    Writes counts and floor percentages for Tasks (cluster, entity_type,
+    taxonomy, cluster_role) and Content (entity_type plus cluster/taxonomy/
+    attribute mappings), then a gap list; ``--detailed`` adds task breakdowns.
+    """
+
+    help = 'Audit metadata completeness for a site (Stage 3)'
+
+    def add_arguments(self, parser):
+        """Register ``--site`` (optional site ID) and ``--detailed`` (flag)."""
+        parser.add_argument(
+            '--site',
+            type=int,
+            help='Site ID to audit (if not provided, audits all sites)',
+        )
+        parser.add_argument(
+            '--detailed',
+            action='store_true',
+            help='Show detailed breakdown by entity type',
+        )
+
+    def handle(self, *args, **options):
+        """Audit the site given by ``--site``, or every site when it is omitted."""
+        site_id = options.get('site')
+        detailed = options.get('detailed', False)
+        sites = Site.objects.all()
+        if site_id is not None:
+            sites = sites.filter(id=site_id)
+        if not sites.exists():
+            # No --site means no ID to name; the error goes to stderr, not the report stream.
+            self.stderr.write('No sites found' if site_id is None else f'Site {site_id} not found')
+            return
+
+        for site in sites:
+            self._audit_site(site, detailed)
+
+    def _write_ratio(self, label, part, total):
+        """Write ``label: part/total (NN%)``; a zero total is reported as 0%."""
+        percent = part * 100 // total if total else 0
+        self.stdout.write(f'  {label}: {part}/{total} ({percent}%)')
+
+    def _write_breakdown(self, title, tasks, field):
+        """Write ``title`` then the task count per ``field`` value, largest first."""
+        self.stdout.write(title)
+        for row in tasks.values(field).annotate(count=Count('id')).order_by('-count'):
+            self.stdout.write(f'  {row[field] or "NULL"}: {row["count"]} tasks')
+
+    def _audit_site(self, site, detailed):
+        """Write the complete metadata report for one ``site``."""
+        rule = '=' * 80
+        self.stdout.write(self.style.SUCCESS(f'\n{rule}'))
+        self.stdout.write(self.style.SUCCESS(f'Auditing Site: {site.name} (ID: {site.id})'))
+        self.stdout.write(self.style.SUCCESS(f'{rule}\n'))
+
+        # NOTE(review): isnull does not match ''; confirm unset metadata is stored as NULL.
+        tasks = Tasks.objects.filter(site=site)
+        total_tasks = tasks.count()
+        with_cluster = tasks.filter(cluster__isnull=False).count()
+        with_entity_type = tasks.filter(entity_type__isnull=False).count()
+
+        self.stdout.write('\n📋 Tasks Summary:')
+        self.stdout.write(f'  Total Tasks: {total_tasks}')
+        self._write_ratio('With Cluster', with_cluster, total_tasks)
+        self._write_ratio('With Entity Type', with_entity_type, total_tasks)
+        self._write_ratio('With Taxonomy', tasks.filter(taxonomy__isnull=False).count(), total_tasks)
+        self._write_ratio('With Cluster Role', tasks.filter(cluster_role__isnull=False).count(), total_tasks)
+
+        content = Content.objects.filter(site=site)
+        total_content = content.count()
+        # values('content').distinct() counts content with several mappings only once.
+        mapped = {
+            model: model.objects.filter(content__site=site).values('content').distinct().count()
+            for model in (ContentClusterMap, ContentTaxonomyMap, ContentAttributeMap)
+        }
+
+        self.stdout.write('\n📄 Content Summary:')
+        self.stdout.write(f'  Total Content: {total_content}')
+        self._write_ratio('With Entity Type', content.filter(entity_type__isnull=False).count(), total_content)
+        self._write_ratio('With Cluster Mapping', mapped[ContentClusterMap], total_content)
+        self._write_ratio('With Taxonomy Mapping', mapped[ContentTaxonomyMap], total_content)
+        self._write_ratio('With Attributes', mapped[ContentAttributeMap], total_content)
+
+        # isnull=True/False partition the rows, so each gap is the total minus its "with" count.
+        self.stdout.write('\n⚠️  Gaps:')
+        self.stdout.write(f'  Tasks missing cluster: {total_tasks - with_cluster}')
+        self.stdout.write(f'  Tasks missing entity_type: {total_tasks - with_entity_type}')
+        self.stdout.write(f'  Content missing cluster mapping: {total_content - mapped[ContentClusterMap]}')
+
+        if detailed:
+            self._write_breakdown('\n📊 Entity Type Breakdown:', tasks, 'entity_type')
+            self._write_breakdown('\n🎯 Cluster Role Breakdown:', tasks, 'cluster_role')
+
+        self.stdout.write('')
+