AI Automation, Scheduling and publishing front and back end refactor

2026-01-17 15:52:46 +00:00
parent 0435a5cf70
commit d3b3e1c0d4
34 changed files with 4715 additions and 375 deletions
--- a/backend/igny8_core/business/automation/tasks.py
+++ b/backend/igny8_core/business/automation/tasks.py
@@ -49,9 +49,9 @@ def check_scheduled_automations():
                    logger.info(f"[AutomationTask] Skipping site {config.site.id} - already ran today")
                    continue
            
-            # Check if already running
-            if AutomationRun.objects.filter(site=config.site, status='running').exists():
-                logger.info(f"[AutomationTask] Skipping site {config.site.id} - already running")
+            # Check if already running OR paused (don't start new if existing in progress)
+            if AutomationRun.objects.filter(site=config.site, status__in=['running', 'paused']).exists():
+                logger.info(f"[AutomationTask] Skipping site {config.site.id} - automation in progress (running/paused)")
                continue
            
            logger.info(f"[AutomationTask] Starting scheduled automation for site {config.site.id}")
@@ -162,13 +162,50 @@ def run_automation_task(self, run_id: str):
@shared_task(name='automation.resume_automation_task', bind=True, max_retries=0)
 def resume_automation_task(self, run_id: str):
    """
-    Resume paused automation run from current stage
+    Resume paused automation run from current stage.
+    
+    CRITICAL FIXES:
+    - Verifies run status is 'running' before processing
+    - Reacquires lock in case it expired during long pause
+    - Checks pause/cancel status after each stage
+    - Releases lock on failure
    """
    logger.info(f"[AutomationTask] Resuming automation run: {run_id}")
    
    try:
+        from django.core.cache import cache
+        
+        # Load run and verify status
+        run = AutomationRun.objects.get(run_id=run_id)
+        
+        # CRITICAL FIX: Verify run is actually in 'running' status
+        # (status is set to 'running' by views.resume before calling this task)
+        if run.status != 'running':
+            logger.warning(f"[AutomationTask] Run {run_id} status is '{run.status}', not 'running'. Aborting resume.")
+            return
+        
+        # CRITICAL FIX: Reacquire lock in case it expired during long pause (6hr timeout)
+        lock_key = f'automation_lock_{run.site.id}'
+        lock_acquired = cache.add(lock_key, run_id, timeout=21600)  # 6 hours
+        
+        if not lock_acquired:
+            # Lock exists - check if it's ours (from original run start)
+            existing_lock = cache.get(lock_key)
+            # If lock exists but isn't our run_id, another run may have started
+            if existing_lock and existing_lock != run_id and existing_lock != 'locked':
+                logger.warning(f"[AutomationTask] Lock held by different run ({existing_lock}). Aborting resume for {run_id}")
+                run.status = 'failed'
+                run.error_message = f'Lock acquired by another run ({existing_lock}) during pause'
+                run.completed_at = timezone.now()
+                run.save()
+                return
+            # Lock exists and is either 'locked' (our old format) or our run_id - proceed
+            logger.info(f"[AutomationTask] Existing lock found, proceeding with resume")
+        else:
+            # We acquired a new lock (old one expired)
+            logger.info(f"[AutomationTask] Reacquired lock after expiry for run {run_id}")
+        
        service = AutomationService.from_run_id(run_id)
-        run = service.run
        config = service.config
        
        # Continue from current stage
@@ -196,20 +233,35 @@ def resume_automation_task(self, run_id: str):
        for stage in range(run.current_stage - 1, 7):
            if stage_enabled[stage]:
                stage_methods[stage]()
+                
+                # CRITICAL FIX: Check for pause/cancel AFTER each stage (same as run_automation_task)
+                service.run.refresh_from_db()
+                if service.run.status in ['paused', 'cancelled']:
+                    logger.info(f"[AutomationTask] Resumed automation {service.run.status} after stage {stage + 1}")
+                    return
            else:
                logger.info(f"[AutomationTask] Stage {stage + 1} is disabled, skipping")
        
-        logger.info(f"[AutomationTask] Resumed automation run: {run_id}")
+        logger.info(f"[AutomationTask] Resumed automation completed: {run_id}")
        
    except Exception as e:
        logger.error(f"[AutomationTask] Failed to resume automation run {run_id}: {e}")
        
-        # Mark as failed
-        run = AutomationRun.objects.get(run_id=run_id)
-        run.status = 'failed'
-        run.error_message = str(e)
-        run.completed_at = timezone.now()
-        run.save()
+        # Mark as failed and release lock
+        try:
+            run = AutomationRun.objects.get(run_id=run_id)
+            run.status = 'failed'
+            run.error_message = str(e)
+            run.completed_at = timezone.now()
+            run.save()
+            
+            # Release lock on failure
+            from django.core.cache import cache
+            cache.delete(f'automation_lock_{run.site.id}')
+        except Exception as cleanup_err:
+            logger.error(f"[AutomationTask] Failed to cleanup after resume failure: {cleanup_err}")
+        
+        raise


 # Alias for continue_automation_task (same as resume)