Implement Stage 3: Enhance content metadata and validation features

- Added entity metadata fields to the Tasks model, including entity_type, taxonomy, and cluster_role.
- Updated CandidateEngine to prioritize content relevance based on cluster mappings.
- Introduced metadata completeness scoring in ContentAnalyzer.
- Enhanced validation services to check for entity type and mapping completeness.
- Updated frontend components to display and validate new metadata fields.
- Implemented API endpoints for content validation and metadata persistence.
- Migrated existing data to populate new metadata fields for Tasks and Content.
2025-11-19 19:21:30 +00:00
parent 38f6026e73
commit bae9ea47d8
33 changed files with 2388 additions and 73 deletions
--- a/backend/igny8_core/business/content/models.py
+++ b/backend/igny8_core/business/content/models.py
@@ -53,6 +53,46 @@ class Tasks(SiteSectorBaseModel):
    content_type = models.CharField(max_length=50, choices=CONTENT_TYPE_CHOICES, default='blog_post')
    status = models.CharField(max_length=50, choices=STATUS_CHOICES, default='queued')
    
+    # Stage 3: Entity metadata fields
+    ENTITY_TYPE_CHOICES = [
+        ('blog_post', 'Blog Post'),
+        ('article', 'Article'),
+        ('product', 'Product'),
+        ('service', 'Service Page'),
+        ('taxonomy', 'Taxonomy Page'),
+        ('page', 'Page'),
+    ]
+    CLUSTER_ROLE_CHOICES = [
+        ('hub', 'Hub Page'),
+        ('supporting', 'Supporting Page'),
+        ('attribute', 'Attribute Page'),
+    ]
+    entity_type = models.CharField(
+        max_length=50,
+        choices=ENTITY_TYPE_CHOICES,
+        default='blog_post',
+        db_index=True,
+        blank=True,
+        null=True,
+        help_text="Type of content entity (inherited from idea/blueprint)"
+    )
+    taxonomy = models.ForeignKey(
+        'site_building.SiteBlueprintTaxonomy',
+        on_delete=models.SET_NULL,
+        null=True,
+        blank=True,
+        related_name='tasks',
+        help_text="Taxonomy association when derived from blueprint planning"
+    )
+    cluster_role = models.CharField(
+        max_length=50,
+        choices=CLUSTER_ROLE_CHOICES,
+        default='hub',
+        blank=True,
+        null=True,
+        help_text="Role within the cluster-driven sitemap"
+    )
+    
    # Content fields
    content = models.TextField(blank=True, null=True)  # Generated content
    word_count = models.IntegerField(default=0)
@@ -78,6 +118,8 @@ class Tasks(SiteSectorBaseModel):
            models.Index(fields=['status']),
            models.Index(fields=['cluster']),
            models.Index(fields=['content_type']),
+            models.Index(fields=['entity_type']),
+            models.Index(fields=['cluster_role']),
            models.Index(fields=['site', 'sector']),
        ]
    
--- a/backend/igny8_core/business/content/services/metadata_mapping_service.py
+++ b/backend/igny8_core/business/content/services/metadata_mapping_service.py
@@ -0,0 +1,116 @@
+"""
+Metadata Mapping Service
+Stage 3: Persists cluster/taxonomy/attribute mappings from Tasks to Content
+"""
+import logging
+from typing import Optional
+from django.db import transaction
+
+from igny8_core.business.content.models import (
+    Tasks,
+    Content,
+    ContentClusterMap,
+    ContentTaxonomyMap,
+    ContentAttributeMap,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class MetadataMappingService:
+    """Service for persisting metadata mappings from Tasks to Content.
+
+    Both public methods are wrapped in ``transaction.atomic``, so the mapping
+    rows and the ``entity_type`` update are written as a single unit.
+    """
+
+    @staticmethod
+    def _mapping_defaults(content: Content, source: str) -> dict:
+        """Build the ``defaults`` used when a mapping row has to be created."""
+        return {
+            'account': content.account,
+            'site': content.site,
+            'sector': content.sector,
+            'source': source,
+            'metadata': {},
+        }
+
+    @transaction.atomic
+    def persist_task_metadata_to_content(self, content: Content) -> None:
+        """
+        Persist cluster/taxonomy mappings and entity_type from Task to Content.
+
+        Mappings are written with ``get_or_create``, so calling this again for
+        the same content does not duplicate rows. ``entity_type`` is copied
+        from the task only when the content does not already have one.
+
+        Args:
+            content: Content instance with an associated task
+        """
+        task = content.task
+        if not task:
+            logger.warning(f"Content {content.id} has no associated task, skipping metadata mapping")
+            return
+
+        # Stage 3: Persist cluster mapping if task has cluster
+        if task.cluster:
+            _, created = ContentClusterMap.objects.get_or_create(
+                content=content,
+                cluster=task.cluster,
+                role=task.cluster_role or 'hub',
+                defaults=self._mapping_defaults(
+                    content, 'blueprint' if task.idea else 'manual'
+                ),
+            )
+            # Report what actually happened instead of always claiming creation.
+            action = "Created" if created else "Found existing"
+            logger.info(f"{action} cluster mapping for content {content.id} -> cluster {task.cluster.id}")
+
+        # Stage 3: Persist taxonomy mapping if task has taxonomy
+        if task.taxonomy:
+            _, created = ContentTaxonomyMap.objects.get_or_create(
+                content=content,
+                taxonomy=task.taxonomy,
+                defaults=self._mapping_defaults(content, 'blueprint'),
+            )
+            action = "Created" if created else "Found existing"
+            logger.info(f"{action} taxonomy mapping for content {content.id} -> taxonomy {task.taxonomy.id}")
+
+        # Stage 3: Inherit entity_type from task (never overwrite an existing value)
+        if task.entity_type and not content.entity_type:
+            content.entity_type = task.entity_type
+            content.save(update_fields=['entity_type'])
+            logger.info(f"Set entity_type {task.entity_type} for content {content.id}")
+
+        # Stage 3: Extract attributes from task metadata if available
+        # This can be extended to parse task.description or task.metadata for attributes
+        # For now, we'll rely on explicit attribute data in future enhancements
+
+    @transaction.atomic
+    def backfill_content_metadata(self, content: Content) -> None:
+        """
+        Backfill metadata mappings for existing content that may be missing mappings.
+
+        Resolution order: skip when a cluster mapping already exists, otherwise
+        copy from the associated task, otherwise fall back to a ``cluster_id``
+        stored in ``content.metadata`` (mapped with role ``hub``).
+
+        Args:
+            content: Content instance to backfill
+        """
+        # Only the cluster mapping is checked here; content that already has
+        # one is treated as fully mapped.
+        if ContentClusterMap.objects.filter(content=content).exists():
+            return
+
+        # Try to infer from task
+        if content.task:
+            self.persist_task_metadata_to_content(content)
+            return
+
+        # Try to infer from content metadata; anything that is not a dict
+        # cannot carry a cluster_id, so there is nothing to backfill.
+        metadata = content.metadata
+        if not isinstance(metadata, dict):
+            return
+
+        cluster_id = metadata.get('cluster_id')
+        if not cluster_id:
+            return
+
+        from igny8_core.business.planning.models import Clusters
+        # NOTE(review): the cluster is looked up by id only - confirm whether
+        # it should also be restricted to the content's account/site.
+        try:
+            cluster = Clusters.objects.get(id=cluster_id)
+        except Clusters.DoesNotExist:
+            logger.warning(f"Cluster {cluster_id} not found for content {content.id}")
+            return
+        except (TypeError, ValueError):
+            # A malformed id in free-form metadata must not abort the backfill.
+            logger.warning(f"Invalid cluster_id {cluster_id!r} in metadata for content {content.id}")
+            return
+
+        ContentClusterMap.objects.get_or_create(
+            content=content,
+            cluster=cluster,
+            role='hub',  # Default
+            defaults=self._mapping_defaults(content, 'manual'),
+        )
+
--- a/backend/igny8_core/business/content/services/validation_service.py
+++ b/backend/igny8_core/business/content/services/validation_service.py
@@ -0,0 +1,170 @@
+"""
+Content Validation Service
+Stage 3: Validates content metadata before publish
+"""
+import logging
+from typing import List, Dict, Optional
+from django.core.exceptions import ValidationError
+
+from igny8_core.business.content.models import Tasks, Content
+
+logger = logging.getLogger(__name__)
+
+
+class ContentValidationService:
+    """Service for validating content metadata requirements.
+
+    Every validator returns a list of error dicts with ``field``, ``code``
+    and ``message`` keys; an empty list means validation passed.
+    """
+
+    # Entity types that must carry a taxonomy association/mapping.
+    _TAXONOMY_REQUIRED_TYPES = ('product', 'service')
+    # Attribute names required on product content, in the order they are reported.
+    _REQUIRED_PRODUCT_ATTRS = ('price', 'sku', 'category')
+
+    @staticmethod
+    def _error(field: str, code: str, message: str) -> Dict[str, str]:
+        """Build a single validation error entry."""
+        return {'field': field, 'code': code, 'message': message}
+
+    def validate_task(self, task: Tasks) -> List[Dict[str, str]]:
+        """
+        Validate a task has required metadata.
+
+        Args:
+            task: Task instance to validate
+
+        Returns:
+            List of validation errors (empty if valid)
+        """
+        errors = []
+
+        # Stage 3: Enforce "no cluster, no task" rule when feature flag enabled
+        from django.conf import settings
+        if getattr(settings, 'USE_SITE_BUILDER_REFACTOR', False) and not task.cluster:
+            errors.append(self._error(
+                'cluster',
+                'missing_cluster',
+                'Task must be associated with a cluster before content generation',
+            ))
+
+        # Stage 3: Validate entity_type is set
+        if not task.entity_type:
+            errors.append(self._error(
+                'entity_type',
+                'missing_entity_type',
+                'Task must have an entity type specified',
+            ))
+
+        # Stage 3: Validate taxonomy for product/service entities
+        if task.entity_type in self._TAXONOMY_REQUIRED_TYPES and not task.taxonomy:
+            errors.append(self._error(
+                'taxonomy',
+                'missing_taxonomy',
+                f'{task.entity_type.title()} tasks require a taxonomy association',
+            ))
+
+        return errors
+
+    def validate_content(self, content: Content) -> List[Dict[str, str]]:
+        """
+        Validate content has required metadata before publish.
+
+        Args:
+            content: Content instance to validate
+
+        Returns:
+            List of validation errors (empty if valid)
+        """
+        from igny8_core.business.content.models import (
+            ContentAttributeMap,
+            ContentClusterMap,
+            ContentTaxonomyMap,
+        )
+
+        errors = []
+
+        # Stage 3: Validate entity_type
+        if not content.entity_type:
+            errors.append(self._error(
+                'entity_type',
+                'missing_entity_type',
+                'Content must have an entity type specified',
+            ))
+
+        # Stage 3: Validate cluster mapping exists for IGNY8 content
+        if content.source == 'igny8':
+            if not ContentClusterMap.objects.filter(content=content).exists():
+                errors.append(self._error(
+                    'cluster_mapping',
+                    'missing_cluster_mapping',
+                    'Content must be mapped to at least one cluster',
+                ))
+
+        # Stage 3: Validate taxonomy for product/service content
+        if content.entity_type in self._TAXONOMY_REQUIRED_TYPES:
+            if not ContentTaxonomyMap.objects.filter(content=content).exists():
+                errors.append(self._error(
+                    'taxonomy_mapping',
+                    'missing_taxonomy_mapping',
+                    f'{content.entity_type.title()} content requires a taxonomy mapping',
+                ))
+
+        # Stage 3: Validate required attributes for products
+        if content.entity_type == 'product':
+            present = set(
+                ContentAttributeMap.objects.filter(
+                    content=content,
+                    name__in=self._REQUIRED_PRODUCT_ATTRS,
+                ).values_list('name', flat=True)
+            )
+            # Keep the declared order so the message is deterministic
+            # (joining a set yields an arbitrary order).
+            missing_attrs = [
+                name for name in self._REQUIRED_PRODUCT_ATTRS if name not in present
+            ]
+            if missing_attrs:
+                errors.append(self._error(
+                    'attributes',
+                    'missing_attributes',
+                    f'Product content requires attributes: {", ".join(missing_attrs)}',
+                ))
+
+        return errors
+
+    def validate_for_publish(self, content: Content) -> List[Dict[str, str]]:
+        """
+        Comprehensive validation before publishing content.
+
+        Runs ``validate_content`` and then adds the publish-only checks
+        (title present, at least 100 characters of stripped ``html_content``).
+
+        Args:
+            content: Content instance to validate
+
+        Returns:
+            List of validation errors (empty if ready to publish)
+        """
+        # Basic content validation
+        errors = list(self.validate_content(content))
+
+        # Additional publish requirements
+        if not content.title:
+            errors.append(self._error(
+                'title',
+                'missing_title',
+                'Content must have a title before publishing',
+            ))
+
+        # The length is measured on the raw HTML string, markup included.
+        if not content.html_content or len(content.html_content.strip()) < 100:
+            errors.append(self._error(
+                'html_content',
+                'insufficient_content',
+                'Content must have at least 100 characters before publishing',
+            ))
+
+        return errors
+
+    def ensure_required_attributes(self, task: Tasks) -> List[Dict[str, str]]:
+        """
+        Check if task has required attributes based on entity type.
+
+        Args:
+            task: Task instance to check
+
+        Returns:
+            List of missing attribute errors
+        """
+        errors = []
+
+        # Only the taxonomy (product category) is checked for products here.
+        if task.entity_type == 'product' and not task.taxonomy:
+            errors.append(self._error(
+                'taxonomy',
+                'missing_taxonomy',
+                'Product tasks require a taxonomy (product category)',
+            ))
+
+        return errors
+
--- a/backend/igny8_core/business/linking/services/candidate_engine.py
+++ b/backend/igny8_core/business/linking/services/candidate_engine.py
@@ -40,6 +40,9 @@ class CandidateEngine:
    
    def _find_relevant_content(self, content: Content) -> List[Content]:
        """Find relevant content from same account/site/sector"""
+        # Stage 3: Use cluster mappings for better relevance
+        from igny8_core.business.content.models import ContentClusterMap
+        
        # Get content from same account, site, and sector
        queryset = Content.objects.filter(
            account=content.account,
@@ -48,7 +51,25 @@ class CandidateEngine:
            status__in=['draft', 'review', 'publish']
        ).exclude(id=content.id)
        
-        # Filter by keywords if available
+        # Stage 3: Prioritize content from same cluster
+        content_clusters = ContentClusterMap.objects.filter(
+            content=content
+        ).values_list('cluster_id', flat=True)
+        
+        if content_clusters:
+            # Find content mapped to same clusters
+            cluster_content_ids = ContentClusterMap.objects.filter(
+                cluster_id__in=content_clusters
+            ).exclude(content=content).values_list('content_id', flat=True).distinct()
+            
+            # Prioritize cluster-matched content
+            cluster_matched = queryset.filter(id__in=cluster_content_ids)
+            other_content = queryset.exclude(id__in=cluster_content_ids)
+            
+            # Combine: cluster-matched first, then others
+            return list(cluster_matched[:30]) + list(other_content[:20])
+        
+        # Fallback to keyword-based filtering
        if content.primary_keyword:
            queryset = queryset.filter(
                models.Q(primary_keyword__icontains=content.primary_keyword) |
@@ -59,38 +80,72 @@ class CandidateEngine:
    
    def _score_candidates(self, content: Content, candidates: List[Content]) -> List[Dict]:
        """Score candidates based on relevance"""
+        from igny8_core.business.content.models import ContentClusterMap, ContentTaxonomyMap
+        
+        # Stage 3: Get cluster mappings for content
+        content_clusters = set(
+            ContentClusterMap.objects.filter(content=content)
+            .values_list('cluster_id', flat=True)
+        )
+        content_taxonomies = set(
+            ContentTaxonomyMap.objects.filter(content=content)
+            .values_list('taxonomy_id', flat=True)
+        )
+        
        scored = []
        
        for candidate in candidates:
            score = 0
            
-            # Keyword overlap (higher weight)
+            # Stage 3: Cluster matching (highest priority)
+            candidate_clusters = set(
+                ContentClusterMap.objects.filter(content=candidate)
+                .values_list('cluster_id', flat=True)
+            )
+            cluster_overlap = content_clusters & candidate_clusters
+            if cluster_overlap:
+                score += 50 * len(cluster_overlap)  # High weight for cluster matches
+            
+            # Stage 3: Taxonomy matching
+            candidate_taxonomies = set(
+                ContentTaxonomyMap.objects.filter(content=candidate)
+                .values_list('taxonomy_id', flat=True)
+            )
+            taxonomy_overlap = content_taxonomies & candidate_taxonomies
+            if taxonomy_overlap:
+                score += 20 * len(taxonomy_overlap)
+            
+            # Stage 3: Entity type matching
+            if content.entity_type == candidate.entity_type:
+                score += 15
+            
+            # Keyword overlap (medium weight)
            if content.primary_keyword and candidate.primary_keyword:
                if content.primary_keyword.lower() in candidate.primary_keyword.lower():
-                    score += 30
+                    score += 20
                if candidate.primary_keyword.lower() in content.primary_keyword.lower():
-                    score += 30
+                    score += 20
            
            # Secondary keywords overlap
            if content.secondary_keywords and candidate.secondary_keywords:
                overlap = set(content.secondary_keywords) & set(candidate.secondary_keywords)
-                score += len(overlap) * 10
+                score += len(overlap) * 5
            
            # Category overlap
            if content.categories and candidate.categories:
                overlap = set(content.categories) & set(candidate.categories)
-                score += len(overlap) * 5
+                score += len(overlap) * 3
            
            # Tag overlap
            if content.tags and candidate.tags:
                overlap = set(content.tags) & set(candidate.tags)
-                score += len(overlap) * 3
+                score += len(overlap) * 2
            
            # Recency bonus (newer content gets slight boost)
            if candidate.generated_at:
                days_old = (content.generated_at - candidate.generated_at).days
                if days_old < 30:
-                    score += 5
+                    score += 3
            
            if score > 0:
                scored.append({
@@ -98,6 +153,8 @@ class CandidateEngine:
                    'title': candidate.title or candidate.task.title if candidate.task else 'Untitled',
                    'url': f"/content/{candidate.id}/",  # Placeholder - actual URL depends on routing
                    'relevance_score': score,
+                    'cluster_match': len(cluster_overlap) > 0,  # Stage 3: Flag cluster matches
+                    'taxonomy_match': len(taxonomy_overlap) > 0,  # Stage 3: Flag taxonomy matches
                    'anchor_text': self._generate_anchor_text(candidate, content)
                })
        
--- a/backend/igny8_core/business/optimization/services/analyzer.py
+++ b/backend/igny8_core/business/optimization/services/analyzer.py
@@ -35,25 +35,77 @@ class ContentAnalyzer:
        readability_score = self._calculate_readability_score(content)
        engagement_score = self._calculate_engagement_score(content)
        
-        # Overall score is weighted average
+        # Stage 3: Calculate metadata completeness score
+        metadata_score = self._calculate_metadata_score(content)
+        
+        # Overall score is weighted average (includes metadata)
        overall_score = (
-            seo_score * 0.4 +
-            readability_score * 0.3 +
-            engagement_score * 0.3
+            seo_score * 0.35 +
+            readability_score * 0.25 +
+            engagement_score * 0.25 +
+            metadata_score * 0.15
        )
        
        return {
            'seo_score': round(seo_score, 2),
            'readability_score': round(readability_score, 2),
            'engagement_score': round(engagement_score, 2),
+            'metadata_score': round(metadata_score, 2),  # Stage 3: Add metadata score
            'overall_score': round(overall_score, 2),
            'word_count': content.word_count or 0,
            'has_meta_title': bool(content.meta_title),
            'has_meta_description': bool(content.meta_description),
            'has_primary_keyword': bool(content.primary_keyword),
-            'internal_links_count': len(content.internal_links) if content.internal_links else 0
+            'internal_links_count': len(content.internal_links) if content.internal_links else 0,
+            # Stage 3: Metadata completeness indicators
+            'has_entity_type': bool(content.entity_type),
+            'has_cluster_mapping': self._has_cluster_mapping(content),
+            'has_taxonomy_mapping': self._has_taxonomy_mapping(content),
        }
    
+    def _calculate_metadata_score(self, content: Content) -> float:
+        """Stage 3: Score how complete the content's metadata is (0-100).
+
+        Points: entity type 20, cluster mapping 30, taxonomy mapping 30
+        (15 partial credit when absent on non-product/service content),
+        product attributes 20 (10 when only one or two are present).
+        """
+        # NOTE(review): the 20 attribute points are only reachable by products,
+        # so other entity types top out at 80 - confirm this is intended.
+        kind = content.entity_type
+
+        # Entity type (20 points)
+        total = 20 if kind else 0
+
+        # Cluster mapping (30 points)
+        if self._has_cluster_mapping(content):
+            total += 30
+
+        # Taxonomy mapping (30 points); products/services get nothing without
+        # one, every other type receives partial credit.
+        if self._has_taxonomy_mapping(content):
+            total += 30
+        elif kind not in ['product', 'service']:
+            total += 15
+
+        # Attributes (up to 20 points) - products only
+        if kind == 'product':
+            from igny8_core.business.content.models import ContentAttributeMap
+            attribute_total = ContentAttributeMap.objects.filter(content=content).count()
+            if attribute_total >= 3:
+                total += 20
+            elif attribute_total >= 1:
+                total += 10
+
+        return min(total, 100)
+    
+    def _has_cluster_mapping(self, content: Content) -> bool:
+        """Stage 3: Return True when at least one ContentClusterMap row references the content."""
+        from igny8_core.business.content.models import ContentClusterMap
+        cluster_links = ContentClusterMap.objects.filter(content=content)
+        return cluster_links.exists()
+    
+    def _has_taxonomy_mapping(self, content: Content) -> bool:
+        """Stage 3: Return True when at least one ContentTaxonomyMap row references the content."""
+        from igny8_core.business.content.models import ContentTaxonomyMap
+        taxonomy_links = ContentTaxonomyMap.objects.filter(content=content)
+        return taxonomy_links.exists()
+    
    def _calculate_seo_score(self, content: Content) -> float:
        """Calculate SEO score (0-100)"""
        score = 0
--- a/backend/igny8_core/business/site_building/services/page_generation_service.py
+++ b/backend/igny8_core/business/site_building/services/page_generation_service.py
@@ -225,6 +225,38 @@ class PageGenerationService:

        keywords = self._build_keywords_hint(page_blueprint)

+        # Stage 3: Map page type to entity_type
+        entity_type_map = {
+            'home': 'page',
+            'about': 'page',
+            'services': 'service',
+            'products': 'product',
+            'blog': 'blog_post',
+            'contact': 'page',
+            'custom': 'page',
+        }
+        entity_type = entity_type_map.get(page_blueprint.type, 'page')
+        
+        # Stage 3: Try to find related cluster and taxonomy from blueprint
+        cluster_role = 'hub'  # Default
+        taxonomy = None
+        
+        # Find cluster link for this blueprint to infer role
+        from igny8_core.business.site_building.models import SiteBlueprintCluster
+        cluster_link = SiteBlueprintCluster.objects.filter(
+            site_blueprint=page_blueprint.site_blueprint
+        ).first()
+        if cluster_link:
+            cluster_role = cluster_link.role
+        
+        # Find taxonomy if page type suggests it (products/services)
+        if page_blueprint.type in ['products', 'services']:
+            from igny8_core.business.site_building.models import SiteBlueprintTaxonomy
+            taxonomy = SiteBlueprintTaxonomy.objects.filter(
+                site_blueprint=page_blueprint.site_blueprint,
+                taxonomy_type__in=['product_category', 'service_category']
+            ).first()
+        
        task = Tasks.objects.create(
            account=page_blueprint.account,
            site=page_blueprint.site,
@@ -235,6 +267,10 @@ class PageGenerationService:
            content_structure=self._map_content_structure(page_blueprint.type),
            content_type='article',
            status='queued',
+            # Stage 3: Set entity metadata
+            entity_type=entity_type,
+            taxonomy=taxonomy,
+            cluster_role=cluster_role,
        )

        logger.info(