Implement Stage 3: Enhance content metadata and validation features
- Added entity metadata fields to the Tasks model, including entity_type, taxonomy, and cluster_role.
- Updated CandidateEngine to prioritize content relevance based on cluster mappings.
- Introduced metadata completeness scoring in ContentAnalyzer.
- Enhanced validation services to check for entity type and mapping completeness.
- Updated frontend components to display and validate new metadata fields.
- Implemented API endpoints for content validation and metadata persistence.
- Migrated existing data to populate new metadata fields for Tasks and Content.
This commit is contained in:
@@ -40,6 +40,9 @@ class CandidateEngine:
|
||||
|
||||
def _find_relevant_content(self, content: Content) -> List[Content]:
|
||||
"""Find relevant content from same account/site/sector"""
|
||||
# Stage 3: Use cluster mappings for better relevance
|
||||
from igny8_core.business.content.models import ContentClusterMap
|
||||
|
||||
# Get content from same account, site, and sector
|
||||
queryset = Content.objects.filter(
|
||||
account=content.account,
|
||||
@@ -48,7 +51,25 @@ class CandidateEngine:
|
||||
status__in=['draft', 'review', 'publish']
|
||||
).exclude(id=content.id)
|
||||
|
||||
# Filter by keywords if available
|
||||
# Stage 3: Prioritize content from same cluster
|
||||
content_clusters = ContentClusterMap.objects.filter(
|
||||
content=content
|
||||
).values_list('cluster_id', flat=True)
|
||||
|
||||
if content_clusters:
|
||||
# Find content mapped to same clusters
|
||||
cluster_content_ids = ContentClusterMap.objects.filter(
|
||||
cluster_id__in=content_clusters
|
||||
).exclude(content=content).values_list('content_id', flat=True).distinct()
|
||||
|
||||
# Prioritize cluster-matched content
|
||||
cluster_matched = queryset.filter(id__in=cluster_content_ids)
|
||||
other_content = queryset.exclude(id__in=cluster_content_ids)
|
||||
|
||||
# Combine: cluster-matched first, then others
|
||||
return list(cluster_matched[:30]) + list(other_content[:20])
|
||||
|
||||
# Fallback to keyword-based filtering
|
||||
if content.primary_keyword:
|
||||
queryset = queryset.filter(
|
||||
models.Q(primary_keyword__icontains=content.primary_keyword) |
|
||||
@@ -59,38 +80,72 @@ class CandidateEngine:
|
||||
|
||||
def _score_candidates(self, content: Content, candidates: List[Content]) -> List[Dict]:
|
||||
"""Score candidates based on relevance"""
|
||||
from igny8_core.business.content.models import ContentClusterMap, ContentTaxonomyMap
|
||||
|
||||
# Stage 3: Get cluster mappings for content
|
||||
content_clusters = set(
|
||||
ContentClusterMap.objects.filter(content=content)
|
||||
.values_list('cluster_id', flat=True)
|
||||
)
|
||||
content_taxonomies = set(
|
||||
ContentTaxonomyMap.objects.filter(content=content)
|
||||
.values_list('taxonomy_id', flat=True)
|
||||
)
|
||||
|
||||
scored = []
|
||||
|
||||
for candidate in candidates:
|
||||
score = 0
|
||||
|
||||
# Keyword overlap (higher weight)
|
||||
# Stage 3: Cluster matching (highest priority)
|
||||
candidate_clusters = set(
|
||||
ContentClusterMap.objects.filter(content=candidate)
|
||||
.values_list('cluster_id', flat=True)
|
||||
)
|
||||
cluster_overlap = content_clusters & candidate_clusters
|
||||
if cluster_overlap:
|
||||
score += 50 * len(cluster_overlap) # High weight for cluster matches
|
||||
|
||||
# Stage 3: Taxonomy matching
|
||||
candidate_taxonomies = set(
|
||||
ContentTaxonomyMap.objects.filter(content=candidate)
|
||||
.values_list('taxonomy_id', flat=True)
|
||||
)
|
||||
taxonomy_overlap = content_taxonomies & candidate_taxonomies
|
||||
if taxonomy_overlap:
|
||||
score += 20 * len(taxonomy_overlap)
|
||||
|
||||
# Stage 3: Entity type matching
|
||||
if content.entity_type == candidate.entity_type:
|
||||
score += 15
|
||||
|
||||
# Keyword overlap (medium weight)
|
||||
if content.primary_keyword and candidate.primary_keyword:
|
||||
if content.primary_keyword.lower() in candidate.primary_keyword.lower():
|
||||
score += 30
|
||||
score += 20
|
||||
if candidate.primary_keyword.lower() in content.primary_keyword.lower():
|
||||
score += 30
|
||||
score += 20
|
||||
|
||||
# Secondary keywords overlap
|
||||
if content.secondary_keywords and candidate.secondary_keywords:
|
||||
overlap = set(content.secondary_keywords) & set(candidate.secondary_keywords)
|
||||
score += len(overlap) * 10
|
||||
score += len(overlap) * 5
|
||||
|
||||
# Category overlap
|
||||
if content.categories and candidate.categories:
|
||||
overlap = set(content.categories) & set(candidate.categories)
|
||||
score += len(overlap) * 5
|
||||
score += len(overlap) * 3
|
||||
|
||||
# Tag overlap
|
||||
if content.tags and candidate.tags:
|
||||
overlap = set(content.tags) & set(candidate.tags)
|
||||
score += len(overlap) * 3
|
||||
score += len(overlap) * 2
|
||||
|
||||
# Recency bonus (newer content gets slight boost)
|
||||
if candidate.generated_at:
|
||||
days_old = (content.generated_at - candidate.generated_at).days
|
||||
if days_old < 30:
|
||||
score += 5
|
||||
score += 3
|
||||
|
||||
if score > 0:
|
||||
scored.append({
|
||||
@@ -98,6 +153,8 @@ class CandidateEngine:
|
||||
'title': candidate.title or candidate.task.title if candidate.task else 'Untitled',
|
||||
'url': f"/content/{candidate.id}/", # Placeholder - actual URL depends on routing
|
||||
'relevance_score': score,
|
||||
'cluster_match': len(cluster_overlap) > 0, # Stage 3: Flag cluster matches
|
||||
'taxonomy_match': len(taxonomy_overlap) > 0, # Stage 3: Flag taxonomy matches
|
||||
'anchor_text': self._generate_anchor_text(candidate, content)
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user