AI Automation, Scheduling and publishing front and back end refactor
This commit is contained in:
@@ -49,9 +49,9 @@ def check_scheduled_automations():
|
||||
logger.info(f"[AutomationTask] Skipping site {config.site.id} - already ran today")
|
||||
continue
|
||||
|
||||
# Check if already running
|
||||
if AutomationRun.objects.filter(site=config.site, status='running').exists():
|
||||
logger.info(f"[AutomationTask] Skipping site {config.site.id} - already running")
|
||||
# Check if already running OR paused (don't start new if existing in progress)
|
||||
if AutomationRun.objects.filter(site=config.site, status__in=['running', 'paused']).exists():
|
||||
logger.info(f"[AutomationTask] Skipping site {config.site.id} - automation in progress (running/paused)")
|
||||
continue
|
||||
|
||||
logger.info(f"[AutomationTask] Starting scheduled automation for site {config.site.id}")
|
||||
@@ -162,13 +162,50 @@ def run_automation_task(self, run_id: str):
|
||||
@shared_task(name='automation.resume_automation_task', bind=True, max_retries=0)
|
||||
def resume_automation_task(self, run_id: str):
|
||||
"""
|
||||
Resume paused automation run from current stage
|
||||
Resume paused automation run from current stage.
|
||||
|
||||
CRITICAL FIXES:
|
||||
- Verifies run status is 'running' before processing
|
||||
- Reacquires lock in case it expired during long pause
|
||||
- Checks pause/cancel status after each stage
|
||||
- Releases lock on failure
|
||||
"""
|
||||
logger.info(f"[AutomationTask] Resuming automation run: {run_id}")
|
||||
|
||||
try:
|
||||
from django.core.cache import cache
|
||||
|
||||
# Load run and verify status
|
||||
run = AutomationRun.objects.get(run_id=run_id)
|
||||
|
||||
# CRITICAL FIX: Verify run is actually in 'running' status
|
||||
# (status is set to 'running' by views.resume before calling this task)
|
||||
if run.status != 'running':
|
||||
logger.warning(f"[AutomationTask] Run {run_id} status is '{run.status}', not 'running'. Aborting resume.")
|
||||
return
|
||||
|
||||
# CRITICAL FIX: Reacquire lock in case it expired during long pause (6hr timeout)
|
||||
lock_key = f'automation_lock_{run.site.id}'
|
||||
lock_acquired = cache.add(lock_key, run_id, timeout=21600) # 6 hours
|
||||
|
||||
if not lock_acquired:
|
||||
# Lock exists - check if it's ours (from original run start)
|
||||
existing_lock = cache.get(lock_key)
|
||||
# If lock exists but isn't our run_id, another run may have started
|
||||
if existing_lock and existing_lock != run_id and existing_lock != 'locked':
|
||||
logger.warning(f"[AutomationTask] Lock held by different run ({existing_lock}). Aborting resume for {run_id}")
|
||||
run.status = 'failed'
|
||||
run.error_message = f'Lock acquired by another run ({existing_lock}) during pause'
|
||||
run.completed_at = timezone.now()
|
||||
run.save()
|
||||
return
|
||||
# Lock exists and is either 'locked' (our old format) or our run_id - proceed
|
||||
logger.info(f"[AutomationTask] Existing lock found, proceeding with resume")
|
||||
else:
|
||||
# We acquired a new lock (old one expired)
|
||||
logger.info(f"[AutomationTask] Reacquired lock after expiry for run {run_id}")
|
||||
|
||||
service = AutomationService.from_run_id(run_id)
|
||||
run = service.run
|
||||
config = service.config
|
||||
|
||||
# Continue from current stage
|
||||
@@ -196,20 +233,35 @@ def resume_automation_task(self, run_id: str):
|
||||
for stage in range(run.current_stage - 1, 7):
|
||||
if stage_enabled[stage]:
|
||||
stage_methods[stage]()
|
||||
|
||||
# CRITICAL FIX: Check for pause/cancel AFTER each stage (same as run_automation_task)
|
||||
service.run.refresh_from_db()
|
||||
if service.run.status in ['paused', 'cancelled']:
|
||||
logger.info(f"[AutomationTask] Resumed automation {service.run.status} after stage {stage + 1}")
|
||||
return
|
||||
else:
|
||||
logger.info(f"[AutomationTask] Stage {stage + 1} is disabled, skipping")
|
||||
|
||||
logger.info(f"[AutomationTask] Resumed automation run: {run_id}")
|
||||
logger.info(f"[AutomationTask] Resumed automation completed: {run_id}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[AutomationTask] Failed to resume automation run {run_id}: {e}")
|
||||
|
||||
# Mark as failed
|
||||
run = AutomationRun.objects.get(run_id=run_id)
|
||||
run.status = 'failed'
|
||||
run.error_message = str(e)
|
||||
run.completed_at = timezone.now()
|
||||
run.save()
|
||||
# Mark as failed and release lock
|
||||
try:
|
||||
run = AutomationRun.objects.get(run_id=run_id)
|
||||
run.status = 'failed'
|
||||
run.error_message = str(e)
|
||||
run.completed_at = timezone.now()
|
||||
run.save()
|
||||
|
||||
# Release lock on failure
|
||||
from django.core.cache import cache
|
||||
cache.delete(f'automation_lock_{run.site.id}')
|
||||
except Exception as cleanup_err:
|
||||
logger.error(f"[AutomationTask] Failed to cleanup after resume failure: {cleanup_err}")
|
||||
|
||||
raise
|
||||
|
||||
|
||||
# Alias for continue_automation_task (same as resume)
|
||||
|
||||
Reference in New Issue
Block a user