Files
SmartUp/backend/app/services/scheduler.py
T
SmartUp Developer 8a6ed249be fix: complete remaining 8 optimization items
- HTTP connection pooling: UpstreamClient & WebsiteClient reuse httpx.Client
- Deduplicate decimal_string into shared app/utils/number.py
- Split scheduler transaction: snapshot write → webhook/website sync in separate sessions
- Remove hardcoded 170.106.100.210 migration from database.py
- Reset consecutive_failures on upstream update
- Healthcheck: install curl, replace python -c with curl -f
- Add .dockerignore to reduce build context
- Frontend: add axios-retry with exponential backoff (5xx/network errors only)
2026-05-17 11:09:35 +08:00

213 lines
7.2 KiB
Python

"""APScheduler background scheduler for upstream checks."""
from __future__ import annotations
import json
import logging
from datetime import datetime, timezone
from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.schedulers.background import BackgroundScheduler
from sqlalchemy.orm import Session
from app.database import SessionLocal
from app.models.upstream import Upstream
from app.models.snapshot import UpstreamRateSnapshot
from app.services.upstream_client import UpstreamClient, UpstreamError, build_snapshot
from app.services.snapshot_service import diff_snapshots, prune_snapshots
from app.services import webhook_service
from app.services import website_sync
from app.config import get_settings
logger = logging.getLogger(__name__)
_scheduler = BackgroundScheduler(timezone="UTC", executors={"default": ThreadPoolExecutor(max_workers=1)})
def get_scheduler() -> BackgroundScheduler:
return _scheduler
def _check_upstream(upstream_id: int) -> None:
"""Full upstream check executed by scheduler (runs in thread).
Phase 1 — upstream API call + snapshot write (single transaction).
Phase 2 — webhook/website sync (separate sessions, so a notification
failure never rolls back the snapshot).
"""
settings = get_settings()
# ── Phase 1: upstream check + DB write ──────────────────────────
db: Session = SessionLocal()
client = None
try:
upstream = db.query(Upstream).filter(Upstream.id == upstream_id).first()
if not upstream or not upstream.enabled:
_remove_job(upstream_id)
return
auth_config = json.loads(upstream.auth_config_json or "{}")
client = UpstreamClient(
base_url=upstream.base_url,
api_prefix=upstream.api_prefix,
auth_type=upstream.auth_type,
auth_config=auth_config,
timeout=float(upstream.timeout_seconds),
)
was_unhealthy = upstream.last_status == "unhealthy"
snapshot = None
changes = None
try:
client.login()
groups = client.get_available_groups(upstream.groups_endpoint)
raw_rates = client.get_group_rates(upstream.rate_endpoint)
snapshot = build_snapshot(
upstream.id, upstream.base_url, upstream.api_prefix, groups, raw_rates
)
except Exception as exc:
# failure path
upstream.consecutive_failures = (upstream.consecutive_failures or 0) + 1
upstream.last_error = str(exc)
upstream.last_checked_at = datetime.now(timezone.utc)
threshold = settings.unhealthy_threshold
became_unhealthy = (
upstream.consecutive_failures >= threshold
and upstream.last_status != "unhealthy"
)
if became_unhealthy:
upstream.last_status = "unhealthy"
db.commit()
logger.warning("upstream %s check failed: %s", upstream.name, exc)
# Phase 2: notify unhealthy in a fresh session
if became_unhealthy:
_notify_status(upstream.id, upstream.name, upstream.base_url,
"upstream_unhealthy", str(exc))
return
# success path
prev_snapshot_row = (
db.query(UpstreamRateSnapshot)
.filter(UpstreamRateSnapshot.upstream_id == upstream_id)
.order_by(UpstreamRateSnapshot.captured_at.desc())
.first()
)
previous = json.loads(prev_snapshot_row.snapshot_json) if prev_snapshot_row else None
changes = diff_snapshots(previous, snapshot)
# save new snapshot
new_row = UpstreamRateSnapshot(
upstream_id=upstream_id,
snapshot_json=json.dumps(snapshot, ensure_ascii=False),
captured_at=datetime.now(timezone.utc),
)
db.add(new_row)
prune_snapshots(db, upstream_id, settings.snapshot_retention_count)
# update upstream status
upstream.last_status = "healthy"
upstream.last_checked_at = datetime.now(timezone.utc)
upstream.last_error = None
upstream.consecutive_failures = 0
db.commit()
logger.info(
"upstream %s: %d rate change(s)" if changes else "upstream %s: no changes",
upstream.name, len(changes) if changes else 0,
)
finally:
client.close()
db.close()
# ── Phase 2: notifications (independent sessions) ──────────────
if was_unhealthy:
_notify_status(upstream_id, upstream.name, upstream.base_url, "upstream_recovered")
if changes:
_notify_rate_changed(upstream_id, upstream.name, upstream.base_url, changes)
_sync_website_bindings(upstream_id, changes)
def _notify_status(
upstream_id: int,
upstream_name: str,
base_url: str,
event: str,
error: str = "",
) -> None:
db = SessionLocal()
try:
webhook_service.send_status_event(db, upstream_id, upstream_name, base_url, event, error)
except Exception:
logger.exception("status webhook failed for upstream %s", upstream_name)
finally:
db.close()
def _notify_rate_changed(
upstream_id: int,
upstream_name: str,
base_url: str,
changes: list[dict[str, Any]],
) -> None:
db = SessionLocal()
try:
webhook_service.send_rate_changed(db, upstream_id, upstream_name, base_url, changes)
except Exception:
logger.exception("rate webhook failed for upstream %s", upstream_name)
finally:
db.close()
def _sync_website_bindings(upstream_id: int, changes: list[dict[str, Any]]) -> None:
db = SessionLocal()
try:
website_sync.sync_affected_bindings(db, upstream_id, changes)
except Exception:
logger.exception("website sync failed for upstream %s", upstream_id)
finally:
db.close()
def _remove_job(upstream_id: int) -> None:
job_id = f"upstream_{upstream_id}"
if _scheduler.get_job(job_id):
_scheduler.remove_job(job_id)
def refresh_upstream(upstream_id: int, interval_seconds: int = 0, enabled: bool = True) -> None:
"""Add/update/remove a scheduler job for the given upstream."""
job_id = f"upstream_{upstream_id}"
if not enabled or interval_seconds <= 0:
_remove_job(upstream_id)
return
_scheduler.add_job(
_check_upstream,
"interval",
seconds=interval_seconds,
id=job_id,
args=[upstream_id],
replace_existing=True,
coalesce=True,
max_instances=1,
)
logger.info("scheduler job %s set to %ds interval", job_id, interval_seconds)
def start_scheduler() -> None:
"""Start scheduler and load all enabled upstreams."""
_scheduler.start()
db: Session = SessionLocal()
try:
upstreams = db.query(Upstream).filter(Upstream.enabled == True).all()
for u in upstreams:
refresh_upstream(u.id, u.check_interval_seconds, u.enabled)
logger.info("scheduler started with %d upstream job(s)", len(upstreams))
finally:
db.close()
def stop_scheduler() -> None:
if _scheduler.running:
_scheduler.shutdown(wait=True)