perf: reduce remote browser CPU usage and reap zombie Chromium processes

- Add tini (init: true in compose) to reap orphan Chromium processes
- Reduce screenshot push frequency (active 0.12→0.20s, idle 0.35→1.00s,
  deep idle 1.00→5.00s, backoff 0.60→2.00s)
- Add 5s timeout to screenshot in WebSocket push loop
- close() now wraps context.close() in asyncio.wait_for(10s)
  with browser.close() fallback on timeout
- Two-phase close logging (closing → closed / close_failed)
- Auth-capture sessions evicted after 10min TTL
- shutdown() with timeout protection and logging
- close_ok correctly tracks success through browser fallback path
This commit is contained in:
liumangmang
2026-06-01 15:47:08 +08:00
parent c8ba25f08e
commit a949969c4d
4 changed files with 75 additions and 19 deletions
+4
View File
@@ -21,6 +21,9 @@ ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN sed -i 's|http://deb.debian.org|https://mirrors.aliyun.com|g; s|http://security.debian.org|https://mirrors.aliyun.com|g' /etc/apt/sources.list.d/debian.sources
# Install tini so it can run as the container's init process and reap orphaned (zombie) Chromium processes
RUN apt-get update && apt-get install -y --no-install-recommends tini && rm -rf /var/lib/apt/lists/*
# 系统依赖层:apt 包安装,缓存 deb 包避免重复下载
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update \
@@ -59,4 +62,5 @@ ENV DATABASE_URL=sqlite:////app/data/app.db
EXPOSE 8000
ENTRYPOINT ["tini", "--"]
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+12 -7
View File
@@ -265,12 +265,12 @@ async def clear_profile(custom_page_id: int, _=Depends(get_current_user)):
# ——— WebSocket stream ———
# Frame interval & diff detection
_WS_MIN_INTERVAL = 0.10
_WS_IDLE_INTERVAL = 0.35
_WS_ACTIVE_INTERVAL = 0.12
_WS_BACKOFF_INTERVAL = 0.60
_WS_DEEP_IDLE_INTERVAL = 1.00
# Frame interval & diff detection (tuned for CPU efficiency)
_WS_MIN_INTERVAL = 0.15
_WS_IDLE_INTERVAL = 1.00
_WS_ACTIVE_INTERVAL = 0.20
_WS_BACKOFF_INTERVAL = 2.00
_WS_DEEP_IDLE_INTERVAL = 5.00
_WS_ACTIVE_WINDOW = 1.25
@@ -361,7 +361,12 @@ async def session_ws(
state = await browser_sessions.state(session_id)
await websocket.send_json({"type": "state", "session": state})
frame = await browser_sessions.screenshot(session_id)
frame = await asyncio.wait_for(
browser_sessions.screenshot(session_id), timeout=5.0)
except asyncio.TimeoutError:
logger.warning("ws screenshot timeout for %s", session_id[:12])
await asyncio.sleep(interval)
continue
except KeyError:
await websocket.send_json({"error": "session_not_found"})
break
+58 -12
View File
@@ -154,6 +154,7 @@ class BrowserSessionService:
except Exception:
await self.close(session.id)
raise
logger.info("session created: %s (page=%s, profile=%s)", session.id[:12], custom_page_id, profile_key)
return session
def _touch(self, session_id: str) -> None:
@@ -325,6 +326,7 @@ class BrowserSessionService:
session = self._discard_session(session_id)
if not session:
return
logger.info("session closing: %s (page=%s, profile=%s)", session_id[:12], session.custom_page_id, session.profile_key)
# 在完全关闭 context 前,强制将最新的状态落盘保存
if session.profile_key and not session.profile_key.startswith("auth-capture-"):
@@ -340,10 +342,29 @@ class BrowserSessionService:
await session.cdp_session.detach()
except Exception:
pass
close_ok = True
# 关闭 context 带超时,避免永远挂起
try:
await session.context.close()
except Exception:
pass
await asyncio.wait_for(session.context.close(), timeout=10.0)
logger.info("session context closed: %s", session_id[:12])
except asyncio.TimeoutError:
close_ok = False
logger.warning("session close timeout: %s (falling back to browser.close)", session_id[:12])
try:
browser = getattr(session.context, "browser", None)
if browser is not None:
await asyncio.wait_for(browser.close(), timeout=5.0)
close_ok = True
logger.info("session browser fallback closed: %s", session_id[:12])
else:
logger.warning("session context.browser is None, cannot fallback: %s", session_id[:12])
except Exception as exc:
logger.warning("session browser fallback failed: %s: %s", session_id[:12], exc)
except Exception as exc:
close_ok = False
logger.warning("session close error: %s: %s", session_id[:12], exc)
# Clean up ephemeral (auth-capture) profile directories
if session.profile_key and session.profile_key.startswith("auth-capture-"):
profile_dir = self._profile_dir(session.profile_key)
@@ -353,6 +374,11 @@ class BrowserSessionService:
except Exception:
pass
if close_ok:
logger.info("session closed: %s", session_id[:12])
else:
logger.warning("session close_failed: %s", session_id[:12])
async def shutdown(self) -> None:
# Cancel the background eviction loop
@@ -364,10 +390,19 @@ class BrowserSessionService:
pass
self._evict_task = None
sessions = list(self._sessions)
if sessions:
logger.info("shutdown: closing %d browser sessions", len(sessions))
for session_id in sessions:
await self.close(session_id)
try:
await asyncio.wait_for(self.close(session_id), timeout=15.0)
except Exception as exc:
logger.warning("shutdown close failed for %s: %s", session_id[:12], exc)
if self._playwright:
await self._playwright.stop()
logger.info("shutdown: stopping playwright")
try:
await asyncio.wait_for(self._playwright.stop(), timeout=10.0)
except Exception as exc:
logger.warning("shutdown playwright stop failed: %s", exc)
self._playwright = None
async def state(self, session_id: str) -> dict[str, Any]:
@@ -638,18 +673,29 @@ class BrowserSessionService:
logger.exception("idle eviction loop error")
async def _evict_idle_sessions(self) -> None:
"""Close oldest idle sessions when over cap, or any past TTL."""
"""Close oldest idle sessions when over cap, or any past TTL.
- Auth-capture sessions: max 10 minutes lifetime.
- Remote browser sessions: close after IDLE_TTL_SECONDS of no WebSocket activity.
"""
now = asyncio.get_event_loop().time()
# First: drop sessions past idle TTL (excluding just-created ones)
to_remove: list[str] = []
for sid, session in self._sessions.items():
if session.profile_key and session.profile_key.startswith("auth-capture-"):
continue # ephemeral sessions are handled separately
last_active = self._last_event_at.get(sid, 0.0)
if last_active > 0 and (now - last_active) > self.IDLE_TTL_SECONDS:
to_remove.append(sid)
# auth-capture: max 10 minute TTL from creation
created = session.tabs.get(session.active_tab_id)
if created:
age = now - created.created_at
if age > 600:
to_remove.append(sid)
logger.info("evicting auth-capture session %s (age=%ds > 600s)", sid[:12], int(age))
else:
# remote browser sessions: idle TTL
last_active = self._last_event_at.get(sid, 0.0)
if last_active > 0 and (now - last_active) > self.IDLE_TTL_SECONDS:
to_remove.append(sid)
logger.info("evicting idle session %s (no activity for >%ds)", sid[:12], self.IDLE_TTL_SECONDS)
for sid in to_remove:
logger.info("evicting idle session %s (no activity for >%ds)", sid[:12], self.IDLE_TTL_SECONDS)
await self.close(sid)
# Second: if still over cap, evict oldest by last_event_at
+1
View File
@@ -24,6 +24,7 @@ services:
options:
max-size: "10m"
max-file: "3"
init: true
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/healthz"]
interval: 30s