Implement Stage 3: Enhance content metadata and validation features

- Added entity metadata fields to the Tasks model, including entity_type, taxonomy, and cluster_role.
- Updated CandidateEngine to prioritize content relevance based on cluster mappings.
- Introduced metadata completeness scoring in ContentAnalyzer.
- Enhanced validation services to check for entity type and mapping completeness.
- Updated frontend components to display and validate new metadata fields.
- Implemented API endpoints for content validation and metadata persistence.
- Migrated existing data to populate new metadata fields for Tasks and Content.
2025-11-19 19:21:30 +00:00
parent 38f6026e73
commit bae9ea47d8
33 changed files with 2388 additions and 73 deletions
--- a/backend/igny8_core/business/linking/services/candidate_engine.py
+++ b/backend/igny8_core/business/linking/services/candidate_engine.py
@@ -40,6 +40,9 @@ class CandidateEngine:
    
    def _find_relevant_content(self, content: Content) -> List[Content]:
        """Find relevant content from same account/site/sector"""
+        # Stage 3: Use cluster mappings for better relevance
+        from igny8_core.business.content.models import ContentClusterMap
+        
        # Get content from same account, site, and sector
        queryset = Content.objects.filter(
            account=content.account,
@@ -48,7 +51,25 @@ class CandidateEngine:
            status__in=['draft', 'review', 'publish']
        ).exclude(id=content.id)
        
-        # Filter by keywords if available
+        # Stage 3: Prioritize content from same cluster
+        content_clusters = ContentClusterMap.objects.filter(
+            content=content
+        ).values_list('cluster_id', flat=True)
+        
+        if content_clusters:
+            # Find content mapped to same clusters
+            cluster_content_ids = ContentClusterMap.objects.filter(
+                cluster_id__in=content_clusters
+            ).exclude(content=content).values_list('content_id', flat=True).distinct()
+            
+            # Prioritize cluster-matched content
+            cluster_matched = queryset.filter(id__in=cluster_content_ids)
+            other_content = queryset.exclude(id__in=cluster_content_ids)
+            
+            # Combine: cluster-matched first, then others
+            return list(cluster_matched[:30]) + list(other_content[:20])
+        
+        # Fallback to keyword-based filtering
        if content.primary_keyword:
            queryset = queryset.filter(
                models.Q(primary_keyword__icontains=content.primary_keyword) |
@@ -59,38 +80,72 @@ class CandidateEngine:
    
    def _score_candidates(self, content: Content, candidates: List[Content]) -> List[Dict]:
        """Score candidates based on relevance"""
+        from igny8_core.business.content.models import ContentClusterMap, ContentTaxonomyMap
+        
+        # Stage 3: Get cluster mappings for content
+        content_clusters = set(
+            ContentClusterMap.objects.filter(content=content)
+            .values_list('cluster_id', flat=True)
+        )
+        content_taxonomies = set(
+            ContentTaxonomyMap.objects.filter(content=content)
+            .values_list('taxonomy_id', flat=True)
+        )
+        
        scored = []
        
        for candidate in candidates:
            score = 0
            
-            # Keyword overlap (higher weight)
+            # Stage 3: Cluster matching (highest priority)
+            candidate_clusters = set(
+                ContentClusterMap.objects.filter(content=candidate)
+                .values_list('cluster_id', flat=True)
+            )
+            cluster_overlap = content_clusters & candidate_clusters
+            if cluster_overlap:
+                score += 50 * len(cluster_overlap)  # High weight for cluster matches
+            
+            # Stage 3: Taxonomy matching
+            candidate_taxonomies = set(
+                ContentTaxonomyMap.objects.filter(content=candidate)
+                .values_list('taxonomy_id', flat=True)
+            )
+            taxonomy_overlap = content_taxonomies & candidate_taxonomies
+            if taxonomy_overlap:
+                score += 20 * len(taxonomy_overlap)
+            
+            # Stage 3: Entity type matching
+            if content.entity_type == candidate.entity_type:
+                score += 15
+            
+            # Keyword overlap (medium weight)
            if content.primary_keyword and candidate.primary_keyword:
                if content.primary_keyword.lower() in candidate.primary_keyword.lower():
-                    score += 30
+                    score += 20
                if candidate.primary_keyword.lower() in content.primary_keyword.lower():
-                    score += 30
+                    score += 20
            
            # Secondary keywords overlap
            if content.secondary_keywords and candidate.secondary_keywords:
                overlap = set(content.secondary_keywords) & set(candidate.secondary_keywords)
-                score += len(overlap) * 10
+                score += len(overlap) * 5
            
            # Category overlap
            if content.categories and candidate.categories:
                overlap = set(content.categories) & set(candidate.categories)
-                score += len(overlap) * 5
+                score += len(overlap) * 3
            
            # Tag overlap
            if content.tags and candidate.tags:
                overlap = set(content.tags) & set(candidate.tags)
-                score += len(overlap) * 3
+                score += len(overlap) * 2
            
            # Recency bonus (newer content gets slight boost)
            if candidate.generated_at:
                days_old = (content.generated_at - candidate.generated_at).days
                if days_old < 30:
-                    score += 5
+                    score += 3
            
            if score > 0:
                scored.append({
@@ -98,6 +153,8 @@ class CandidateEngine:
                    'title': candidate.title or candidate.task.title if candidate.task else 'Untitled',
                    'url': f"/content/{candidate.id}/",  # Placeholder - actual URL depends on routing
                    'relevance_score': score,
+                    'cluster_match': len(cluster_overlap) > 0,  # Stage 3: Flag cluster matches
+                    'taxonomy_match': len(taxonomy_overlap) > 0,  # Stage 3: Flag taxonomy matches
                    'anchor_text': self._generate_anchor_text(candidate, content)
                })