Add source tracking and sync status fields to Content model; update services module

- Introduced new fields in the Content model for source tracking and sync status, including external references and optimization fields.
- Updated the services module to include new content generation and pipeline services for better organization and clarity.
2025-11-17 11:15:15 +00:00
parent fe95d09bbe
commit 9930728e8a
19 changed files with 2281 additions and 1 deletions
--- a/backend/igny8_core/business/linking/services/__init__.py
+++ b/backend/igny8_core/business/linking/services/__init__.py
@@ -0,0 +1,5 @@
+"""
+Linking Services
+"""
+
+
--- a/backend/igny8_core/business/linking/services/candidate_engine.py
+++ b/backend/igny8_core/business/linking/services/candidate_engine.py
@@ -0,0 +1,117 @@
+"""
+Link Candidate Engine
+Finds relevant content for internal linking
+"""
+import logging
+from typing import List, Dict
+from django.db import models
+from igny8_core.business.content.models import Content
+
+logger = logging.getLogger(__name__)
+
+
+class CandidateEngine:
+    """Finds link candidates for content"""
+
+    def find_candidates(self, content: Content, max_candidates: int = 10) -> List[Dict]:
+        """
+        Find link candidates for a piece of content.
+
+        Args:
+            content: Content instance to find links for
+            max_candidates: Maximum number of candidates to return
+                (values below 1 yield an empty list)
+
+        Returns:
+            List of candidate dicts with: {'content_id', 'title', 'url',
+            'relevance_score', 'anchor_text'}, highest score first
+        """
+        if not content or not content.html_content or max_candidates < 1:
+            return []
+
+        # Find relevant content from same account/site/sector
+        relevant_content = self._find_relevant_content(content)
+
+        # Score candidates based on relevance
+        candidates = self._score_candidates(content, relevant_content)
+
+        # Sort by score and return top candidates
+        candidates.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
+
+        return candidates[:max_candidates]
+
+    def _find_relevant_content(self, content: Content) -> List[Content]:
+        """Find relevant content from same account/site/sector"""
+        # Get content from same account, site, and sector
+        queryset = Content.objects.filter(
+            account=content.account,
+            site=content.site,
+            sector=content.sector,
+            status__in=['draft', 'review', 'publish']
+        ).exclude(id=content.id)
+
+        # Filter by keywords if available
+        if content.primary_keyword:
+            queryset = queryset.filter(
+                models.Q(primary_keyword__icontains=content.primary_keyword) |
+                models.Q(secondary_keywords__icontains=content.primary_keyword)
+            )
+
+        # Cap the rows that get scored in Python. Which 50 are kept follows the
+        # queryset's ordering - NOTE(review): confirm Content defines one.
+        return list(queryset[:50])
+
+    @staticmethod
+    def _overlap_count(left, right) -> int:
+        """Count distinct items present in both collections (0 if either is empty)"""
+        if not left or not right:
+            return 0
+        return len(set(left) & set(right))
+
+    def _score_candidates(self, content: Content, candidates: List[Content]) -> List[Dict]:
+        """
+        Score candidates based on relevance.
+
+        Args:
+            content: Source content the links are being found for
+            candidates: Content instances to score against the source
+
+        Returns:
+            One candidate dict per candidate whose score is above zero, in
+            input order
+        """
+        scored = []
+        source_kw = (content.primary_keyword or '').lower()
+
+        for candidate in candidates:
+            score = 0
+
+            # Keyword containment in either direction (equal keywords earn both)
+            candidate_kw = (candidate.primary_keyword or '').lower()
+            if source_kw and candidate_kw:
+                if source_kw in candidate_kw:
+                    score += 30
+                if candidate_kw in source_kw:
+                    score += 30
+
+            # Shared secondary keywords, categories and tags, in falling weight
+            score += self._overlap_count(content.secondary_keywords, candidate.secondary_keywords) * 10
+            score += self._overlap_count(content.categories, candidate.categories) * 5
+            score += self._overlap_count(content.tags, candidate.tags) * 3
+
+            # Recency bonus for candidates generated after the source, or less
+            # than 30 days before it; skipped when either timestamp is missing
+            if content.generated_at and candidate.generated_at:
+                days_old = (content.generated_at - candidate.generated_at).days
+                if days_old < 30:
+                    score += 5
+
+            if score > 0:
+                # Parenthesised so a missing task cannot discard the content's
+                # own title; the task is only consulted when that title is empty
+                title = candidate.title or (candidate.task.title if candidate.task else None)
+                scored.append({
+                    'content_id': candidate.id,
+                    'title': title or 'Untitled',
+                    'url': f"/content/{candidate.id}/",  # Placeholder - actual URL depends on routing
+                    'relevance_score': score,
+                    'anchor_text': self._generate_anchor_text(candidate, content)
+                })
+
+        return scored
+
+    def _generate_anchor_text(self, candidate: Content, source_content: Content) -> str:
+        """Generate anchor text for link: primary keyword, then title, then task title"""
+        # source_content is not consulted yet; it stays in the signature for callers
+        if candidate.primary_keyword:
+            return candidate.primary_keyword
+        if candidate.title:
+            return candidate.title
+        if candidate.task and candidate.task.title:
+            return candidate.task.title
+        return "Learn more"
+
--- a/backend/igny8_core/business/linking/services/injection_engine.py
+++ b/backend/igny8_core/business/linking/services/injection_engine.py
@@ -0,0 +1,73 @@
+"""
+Link Injection Engine
+Injects internal links into content HTML
+"""
+import logging
+import re
+from typing import List, Dict
+from igny8_core.business.content.models import Content
+
+logger = logging.getLogger(__name__)
+
+
+class InjectionEngine:
+    """Injects links into content HTML"""
+
+    # Any tag: group 1 is '/' on a closing tag, group 2 is the tag name
+    _TAG_RE = re.compile(r'<(/?)([a-zA-Z][\w:-]*)?[^>]*>')
+
+    def inject_links(self, content: Content, candidates: List[Dict], max_links: int = 5) -> Dict:
+        """
+        Inject links into content HTML.
+
+        Anchor text is matched case-insensitively as a whole word in text
+        between tags only - never inside tag markup or an existing <a>
+        element - and the linked text keeps the document's own casing.
+        Candidates are tried by descending relevance until max_links links
+        have been placed; candidates whose anchor text is empty or not found
+        are skipped.
+
+        Args:
+            content: Content instance
+            candidates: List of link candidates from CandidateEngine
+            max_links: Maximum number of links to inject
+
+        Returns:
+            Dict with: {'html_content', 'links', 'links_added'}. Each link's
+            'position' is the match offset at the moment it was injected, so
+            it is not adjusted for links inserted afterwards.
+        """
+        # Function-scope import so the module's import block stays untouched
+        from html import escape
+
+        if not content.html_content or not candidates:
+            return {
+                'html_content': content.html_content,
+                'links': [],
+                'links_added': 0
+            }
+
+        markup = content.html_content
+        links_added = []
+        links_used = set()  # Track which candidates we've used
+
+        # Sort candidates by relevance score
+        sorted_candidates = sorted(candidates, key=lambda x: x.get('relevance_score', 0), reverse=True)
+
+        for candidate in sorted_candidates:
+            # The cap applies to links actually placed, not candidates tried
+            if len(links_added) >= max_links:
+                break
+
+            content_id = candidate['content_id']
+            anchor_text = candidate.get('anchor_text', 'Learn more')
+            if content_id in links_used or not anchor_text:
+                continue
+
+            url = candidate.get('url', f"/content/{content_id}/")
+
+            # Whole-word, case-insensitive match of the anchor text
+            pattern = re.compile(r'(?<!\w)' + re.escape(anchor_text) + r'(?!\w)', re.IGNORECASE)
+            match = self._find_linkable_match(markup, pattern)
+            if match is None:
+                continue
+
+            # Reuse the matched text so the article's casing is preserved;
+            # the URL is escaped because it lands inside an attribute value
+            link_html = f'<a href="{escape(str(url), quote=True)}" class="internal-link">{match.group(0)}</a>'
+            markup = markup[:match.start()] + link_html + markup[match.end():]
+
+            links_added.append({
+                'content_id': content_id,
+                'anchor_text': anchor_text,
+                'url': url,
+                'position': match.start()
+            })
+            links_used.add(content_id)
+
+        return {
+            'html_content': markup,
+            'links': links_added,
+            'links_added': len(links_added)
+        }
+
+    def _find_linkable_match(self, markup, pattern):
+        """Return the first pattern match in text outside any tag or <a> element, else None"""
+        anchor_depth = 0  # > 0 while scanning inside an <a>...</a> element
+        cursor = 0        # end offset of the previous tag
+
+        for tag in self._TAG_RE.finditer(markup):
+            # Search only the text run between the previous tag and this one
+            if anchor_depth == 0 and tag.start() > cursor:
+                match = pattern.search(markup, cursor, tag.start())
+                if match:
+                    return match
+
+            if (tag.group(2) or '').lower() == 'a':
+                if tag.group(1):
+                    anchor_depth = max(anchor_depth - 1, 0)
+                else:
+                    anchor_depth += 1
+            cursor = tag.end()
+
+        # Trailing text after the last tag (or the whole string if tag-free)
+        if anchor_depth == 0:
+            return pattern.search(markup, cursor)
+        return None
+
+
--- a/backend/igny8_core/business/linking/services/linker_service.py
+++ b/backend/igny8_core/business/linking/services/linker_service.py
@@ -0,0 +1,101 @@
+"""
+Linker Service
+Main service for processing content for internal linking
+"""
+import logging
+from typing import List
+from igny8_core.business.content.models import Content
+from igny8_core.business.linking.services.candidate_engine import CandidateEngine
+from igny8_core.business.linking.services.injection_engine import InjectionEngine
+from igny8_core.business.billing.services.credit_service import CreditService
+from igny8_core.business.billing.exceptions import InsufficientCreditsError
+
+logger = logging.getLogger(__name__)
+
+
+class LinkerService:
+    """Service for processing content for internal linking"""
+
+    def __init__(self):
+        self.candidate_engine = CandidateEngine()
+        self.injection_engine = InjectionEngine()
+        self.credit_service = CreditService()
+
+    def process(self, content_id: int) -> Content:
+        """
+        Process content for linking.
+
+        When no candidate exists or no link could be placed, the content is
+        returned untouched and no credits are deducted. Otherwise the content
+        update and the credit deduction run inside one transaction, so an
+        exception from the deduction rolls the saved links back.
+
+        Args:
+            content_id: Content ID to process
+
+        Returns:
+            Updated Content instance
+
+        Raises:
+            ValueError: If no content exists with the given ID
+            InsufficientCreditsError: If account doesn't have enough credits
+        """
+        # Function-scope import so the module's import block stays untouched
+        from django.db import transaction
+
+        try:
+            content = Content.objects.get(id=content_id)
+        except Content.DoesNotExist as exc:
+            raise ValueError(f"Content with id {content_id} does not exist") from exc
+
+        account = content.account
+
+        # Check credits up front; InsufficientCreditsError propagates as-is
+        self.credit_service.check_credits(account, 'linking')
+
+        # Find link candidates
+        candidates = self.candidate_engine.find_candidates(content)
+
+        if not candidates:
+            logger.info("No link candidates found for content %s", content_id)
+            return content
+
+        # Inject links
+        result = self.injection_engine.inject_links(content, candidates)
+
+        # Same outcome as "no candidates": nothing changed, so nothing is
+        # saved, versioned or billed
+        if not result['links_added']:
+            logger.info("No links injected for content %s", content_id)
+            return content
+
+        # Persist the update and bill for it as one unit
+        with transaction.atomic():
+            content.html_content = result['html_content']
+            content.internal_links = result['links']
+            # Tolerate an unset version counter
+            content.linker_version = (content.linker_version or 0) + 1
+            content.save(update_fields=['html_content', 'internal_links', 'linker_version'])
+
+            self.credit_service.deduct_credits_for_operation(
+                account=account,
+                operation_type='linking',
+                description=f"Internal linking for content: {content.title or 'Untitled'}",
+                related_object_type='content',
+                related_object_id=content.id
+            )
+
+        logger.info("Linked content %s: %s links added", content_id, result['links_added'])
+
+        return content
+
+    def batch_process(self, content_ids: List[int]) -> List[Content]:
+        """
+        Process multiple content items for linking.
+
+        Items are processed independently: a failure on one ID is logged with
+        its traceback and the remaining IDs are still processed.
+
+        Args:
+            content_ids: List of content IDs to process
+
+        Returns:
+            List of updated Content instances (failed items are omitted)
+        """
+        results = []
+        for content_id in content_ids:
+            try:
+                results.append(self.process(content_id))
+            except Exception as e:
+                # Deliberate best-effort boundary: log and continue with other items
+                logger.error("Error processing content %s: %s", content_id, e, exc_info=True)
+
+        return results
+
+