Add source tracking and sync status fields to Content model; update services module

- Introduced new fields in the Content model for source tracking and sync status, including external references and optimization fields.
- Updated the services module to include new content generation and pipeline services for better organization and clarity.
This commit is contained in:
IGNY8 VPS (Salman)
2025-11-17 11:15:15 +00:00
parent fe95d09bbe
commit 9930728e8a
19 changed files with 2281 additions and 1 deletions

View File

@@ -0,0 +1,5 @@
"""
Linking Services
"""

View File

@@ -0,0 +1,117 @@
"""
Link Candidate Engine
Finds relevant content for internal linking
"""
import logging
from typing import List, Dict
from django.db import models
from igny8_core.business.content.models import Content
logger = logging.getLogger(__name__)
class CandidateEngine:
    """Finds and scores other content items that could be linked from a piece of content."""

    def find_candidates(self, content: "Content", max_candidates: int = 10) -> List[Dict]:
        """
        Find link candidates for a piece of content.

        Args:
            content: Content instance to find links for
            max_candidates: Maximum number of candidates to return

        Returns:
            List of candidate dicts with:
            {'content_id', 'title', 'url', 'relevance_score', 'anchor_text'},
            ordered by descending relevance score. Empty when the content is
            missing or has no HTML body.
        """
        if not content or not content.html_content:
            return []

        # Find relevant content from same account/site/sector
        relevant_content = self._find_relevant_content(content)

        # Score candidates based on relevance
        candidates = self._score_candidates(content, relevant_content)

        # Sort by score and return top candidates
        candidates.sort(key=lambda item: item.get('relevance_score', 0), reverse=True)
        # Clamp so a negative limit yields nothing instead of slicing from the tail.
        return candidates[:max(max_candidates, 0)]

    def _find_relevant_content(self, content: "Content") -> List["Content"]:
        """
        Query other content in the same account, site and sector.

        Only items whose status is draft, review or publish are considered,
        the source content itself is excluded, and at most 50 rows are fetched.
        When the source has a primary keyword, results are narrowed to items
        whose primary or secondary keywords contain it (case-insensitive).
        """
        queryset = Content.objects.filter(
            account=content.account,
            site=content.site,
            sector=content.sector,
            status__in=['draft', 'review', 'publish']
        ).exclude(id=content.id)

        # Filter by keywords if available
        if content.primary_keyword:
            queryset = queryset.filter(
                models.Q(primary_keyword__icontains=content.primary_keyword) |
                models.Q(secondary_keywords__icontains=content.primary_keyword)
            )

        return list(queryset[:50])  # Limit initial query

    def _score_candidates(self, content: "Content", candidates: List["Content"]) -> List[Dict]:
        """
        Score candidates based on relevance to ``content``.

        Weights: +30 for each direction in which one primary keyword contains
        the other, +10 per shared secondary keyword, +5 per shared category,
        +3 per shared tag, and +5 when the candidate was generated less than
        30 days before the source (or after it). Candidates scoring zero are
        dropped.
        """
        scored = []
        for candidate in candidates:
            score = 0

            # Keyword overlap (higher weight); identical keywords score twice.
            if content.primary_keyword and candidate.primary_keyword:
                source_kw = content.primary_keyword.lower()
                candidate_kw = candidate.primary_keyword.lower()
                if source_kw in candidate_kw:
                    score += 30
                if candidate_kw in source_kw:
                    score += 30

            # Secondary keywords overlap
            if content.secondary_keywords and candidate.secondary_keywords:
                overlap = set(content.secondary_keywords) & set(candidate.secondary_keywords)
                score += len(overlap) * 10

            # Category overlap
            if content.categories and candidate.categories:
                overlap = set(content.categories) & set(candidate.categories)
                score += len(overlap) * 5

            # Tag overlap
            if content.tags and candidate.tags:
                overlap = set(content.tags) & set(candidate.tags)
                score += len(overlap) * 3

            # Recency bonus (newer content gets slight boost). Both timestamps
            # are required: the source may not have been generated yet.
            if content.generated_at and candidate.generated_at:
                days_old = (content.generated_at - candidate.generated_at).days
                if days_old < 30:
                    score += 5

            if score > 0:
                # Explicit grouping: title first, then the task title, then a
                # fixed fallback. The task is only consulted when the title is empty.
                title = (
                    candidate.title
                    or (candidate.task.title if candidate.task else None)
                    or 'Untitled'
                )
                scored.append({
                    'content_id': candidate.id,
                    'title': title,
                    'url': f"/content/{candidate.id}/",  # Placeholder - actual URL depends on routing
                    'relevance_score': score,
                    'anchor_text': self._generate_anchor_text(candidate, content)
                })

        return scored

    def _generate_anchor_text(self, candidate: "Content", source_content: "Content") -> str:
        """
        Generate anchor text for a link to ``candidate``.

        Preference order: the candidate's primary keyword, its title, its
        task's title, then the literal "Learn more". ``source_content`` is
        accepted for interface compatibility and is not used.
        """
        if candidate.primary_keyword:
            return candidate.primary_keyword
        if candidate.title:
            return candidate.title
        if candidate.task and candidate.task.title:
            return candidate.task.title
        return "Learn more"

View File

@@ -0,0 +1,73 @@
"""
Link Injection Engine
Injects internal links into content HTML
"""
import logging
import re
from typing import List, Dict
from igny8_core.business.content.models import Content
logger = logging.getLogger(__name__)
class InjectionEngine:
    """Injects internal links into content HTML."""

    # Lightweight tag scanner (not a full HTML parser): matches any "<...>" run
    # so the text between tags can be searched on its own.
    _TAG_RE = re.compile(r'<[^>]*>')
    # Captures the optional closing slash and the element name of a tag.
    _TAG_NAME_RE = re.compile(r'<\s*(/?)\s*([A-Za-z][\w:-]*)')
    # Elements whose inner text must never receive an injected link.
    _NO_LINK_ELEMENTS = frozenset({'a', 'script', 'style'})

    def inject_links(self, content: "Content", candidates: List[Dict], max_links: int = 5) -> Dict:
        """
        Inject links into content HTML.

        Each candidate's anchor text is looked up (case-insensitive, whole
        words only) in the visible text of the document; the first occurrence
        that is not inside a tag, an existing link, or a script/style element
        is wrapped in an ``<a class="internal-link">``. The document's own
        casing of the matched text is preserved.

        Args:
            content: Content instance
            candidates: List of link candidates from CandidateEngine
            max_links: Maximum number of links to inject

        Returns:
            Dict with: {'html_content', 'links', 'links_added'}. Each entry of
            'links' holds 'content_id', 'anchor_text', 'url' and 'position'
            (offset of the match in the HTML at the moment it was injected).
        """
        # Function-scope import keeps the module's import block untouched.
        from html import escape

        if not content.html_content or not candidates:
            return {
                'html_content': content.html_content,
                'links': [],
                'links_added': 0
            }

        markup = content.html_content
        links_added = []
        links_used = set()  # Track which candidates we've used

        # Sort candidates by relevance score
        sorted_candidates = sorted(
            candidates, key=lambda item: item.get('relevance_score', 0), reverse=True
        )

        for candidate in sorted_candidates:
            # Cap the number of links injected, not the number of candidates
            # tried, so an unmatched top candidate does not waste a slot.
            if len(links_added) >= max_links:
                break

            content_id = candidate['content_id']
            if content_id in links_used:
                continue

            anchor_text = candidate.get('anchor_text', 'Learn more')
            if not anchor_text:
                # An empty pattern would match at offset 0 and insert an empty link.
                continue
            url = candidate.get('url', f"/content/{content_id}/")

            # Lookarounds instead of \b so anchors that start or end with
            # punctuation still match, while mid-word hits are rejected.
            pattern = re.compile(
                r'(?<!\w)' + re.escape(anchor_text) + r'(?!\w)', re.IGNORECASE
            )
            match = self._search_text_nodes(markup, pattern)
            if match is None:
                continue

            link_html = (
                f'<a href="{escape(url, quote=True)}" class="internal-link">'
                f'{match.group(0)}</a>'
            )
            markup = markup[:match.start()] + link_html + markup[match.end():]
            links_added.append({
                'content_id': content_id,
                'anchor_text': anchor_text,
                'url': url,
                'position': match.start()
            })
            links_used.add(content_id)

        return {
            'html_content': markup,
            'links': links_added,
            'links_added': len(links_added)
        }

    @classmethod
    def _search_text_nodes(cls, markup: str, pattern):
        """
        Return the first match of ``pattern`` in text outside tags, or None.

        Text nested inside any element listed in ``_NO_LINK_ELEMENTS`` is
        skipped, which also protects links injected earlier in the same run.
        Match offsets are absolute positions in ``markup``.
        """
        blocked_depth = 0  # >0 while inside an element that must not be linked
        cursor = 0
        for tag in cls._TAG_RE.finditer(markup):
            if blocked_depth == 0 and tag.start() > cursor:
                found = pattern.search(markup, cursor, tag.start())
                if found:
                    return found

            tag_text = tag.group(0)
            name_match = cls._TAG_NAME_RE.match(tag_text)
            if name_match and name_match.group(2).lower() in cls._NO_LINK_ELEMENTS:
                if name_match.group(1):
                    blocked_depth = max(0, blocked_depth - 1)
                elif not tag_text.endswith('/>'):
                    blocked_depth += 1
            cursor = tag.end()

        # Trailing text after the last tag (or the whole string if tag-free).
        if blocked_depth == 0:
            return pattern.search(markup, cursor)
        return None

View File

@@ -0,0 +1,101 @@
"""
Linker Service
Main service for processing content for internal linking
"""
import logging
from typing import List
from igny8_core.business.content.models import Content
from igny8_core.business.linking.services.candidate_engine import CandidateEngine
from igny8_core.business.linking.services.injection_engine import InjectionEngine
from igny8_core.business.billing.services.credit_service import CreditService
from igny8_core.business.billing.exceptions import InsufficientCreditsError
logger = logging.getLogger(__name__)
class LinkerService:
    """Service for processing content for internal linking."""

    def __init__(self):
        self.candidate_engine = CandidateEngine()
        self.injection_engine = InjectionEngine()
        self.credit_service = CreditService()

    def process(self, content_id: int) -> "Content":
        """
        Process content for linking.

        Looks up the content, verifies the account can afford a 'linking'
        operation, finds candidates, injects links, then persists the updated
        HTML, link list and incremented linker version and deducts credits.
        When no candidates are found the content is returned unchanged and no
        credits are deducted.

        Args:
            content_id: Content ID to process

        Returns:
            Updated Content instance

        Raises:
            ValueError: If no content exists with the given ID
            InsufficientCreditsError: If account doesn't have enough credits
        """
        # Function-scope import keeps the module's import block untouched.
        from django.db import transaction

        try:
            content = Content.objects.get(id=content_id)
        except Content.DoesNotExist as exc:
            raise ValueError(f"Content with id {content_id} does not exist") from exc

        account = content.account

        # Check credits; InsufficientCreditsError propagates to the caller.
        self.credit_service.check_credits(account, 'linking')

        # Find link candidates
        candidates = self.candidate_engine.find_candidates(content)
        if not candidates:
            logger.info(f"No link candidates found for content {content_id}")
            return content

        # Inject links
        result = self.injection_engine.inject_links(content, candidates)

        # Update content
        content.html_content = result['html_content']
        content.internal_links = result['links']
        content.linker_version += 1

        # Persist and bill as one unit: if the deduction fails, the saved
        # links are rolled back rather than left in place unbilled.
        with transaction.atomic():
            content.save(update_fields=['html_content', 'internal_links', 'linker_version'])
            self.credit_service.deduct_credits_for_operation(
                account=account,
                operation_type='linking',
                description=f"Internal linking for content: {content.title or 'Untitled'}",
                related_object_type='content',
                related_object_id=content.id
            )

        logger.info(f"Linked content {content_id}: {result['links_added']} links added")
        return content

    def batch_process(self, content_ids: List[int]) -> List["Content"]:
        """
        Process multiple content items for linking.

        Items are processed in the given order. A failure on one item is
        logged with its traceback and does not stop the remaining items, so
        the returned list may be shorter than ``content_ids``.

        Args:
            content_ids: List of content IDs to process

        Returns:
            List of updated Content instances (successful items only)
        """
        results = []
        for content_id in content_ids:
            try:
                results.append(self.process(content_id))
            except Exception as e:
                # Deliberate best-effort boundary: log and move on to the next item.
                logger.error(f"Error processing content {content_id}: {str(e)}", exc_info=True)
        return results