feat(auth-capture): full cookie bundle extraction + richer refresh-auth

Problem: Meow upstream uses Cloudflare, which sets cf_clearance + session
cookies that must all be sent together. The old code only captured a single
session-named cookie via a whitelist, discarding cf_clearance entirely, and
wrote back only 'name=value' instead of the full cookie string.

Changes:

auth_capture_service.py:
  - Add _cookie_matches_hostname(): hostname suffix matching supporting
    dot-prefixed domains (.saki.lat matches api.saki.lat)
  - Add _build_cookie_bundle(): collects ALL cookies matching the current
    page's hostname, returns complete 'name1=v1; name2=v2' string
  - _curate_candidates(): new 'cookie_bundle' candidate type (type=0 in sort,
    highest priority), carries cookie_count + cookie_names in extra fields
  - extract_all(): obtain real-time page URL from session.page.url and pass
    to _curate_candidates so cookie domain filtering is accurate
  - Sort order: cookie_bundle > cookie > bearer_token/api_key > credential
  - Fix JWT dedup check: the old `if dedup_key := val not in seen` used a walrus
    that rebound dedup_key to a bool; it is now a plain `if val not in seen` test

custom_pages.py:
  - Add logging import + logger
  - _pick_best_candidate(): cookie preferred_auth_type now tries cookie_bundle
    first, then single cookie; bearer/api_key use existing type_map logic
  - RefreshAuthResponse: add optional 'warning' field
  - refresh_auth(): handle ctype='cookie_bundle' same as 'cookie'; always
    write full candidate.value as cookie_string (works for both types)
  - Post-write validation: attempt get_available_groups with new credentials;
    on failure, still commit (lenient mode) but set warning message explaining
    cf_clearance IP-binding as the likely cause; success logs at INFO level

Tests (test_auth_capture.py, 19 cases):
  - _cookie_matches_hostname: exact, dot-prefix subdomain, empty domain,
    different domain, evil-subdomain partial match rejection
  - _build_cookie_bundle: cf_clearance included, cross-domain excluded,
    single cookie, empty value excluded, no cookies
  - _curate_candidates: bundle ranks first, value is full string, bundle
    beats single session cookie, bearer wins when no cookies, empty case,
    cookie_count/cookie_names in extra, session fallback preserved,
    new_api_user propagation to bundle

All 46 tests pass.
This commit is contained in:
liumangmang
2026-06-02 09:32:23 +08:00
parent f17317b13c
commit 4f9acdc99c
3 changed files with 352 additions and 17 deletions
+84 -5
View File
@@ -4,6 +4,7 @@ from __future__ import annotations
import json
import logging
from typing import Any
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
@@ -110,7 +111,18 @@ async def extract_all(session: Any) -> dict[str, Any]:
session_storage = await extract_session_storage(page)
auth_headers = await extract_request_headers(session)
new_api_user = _find_new_api_user(local_storage, session_storage) or await extract_new_api_user_id(page)
candidates = _curate_candidates(cookies, local_storage, session_storage, auth_headers, new_api_user)
# 获取当前浏览器页面的真实 URL(比 session.url 更准确)
page_url = ""
try:
page_url = page.url or ""
except Exception:
pass
candidates = _curate_candidates(
cookies, local_storage, session_storage, auth_headers, new_api_user,
page_url=page_url,
)
return {
"cookies": cookies,
@@ -121,17 +133,83 @@ async def extract_all(session: Any) -> dict[str, Any]:
}
def _cookie_matches_hostname(cookie_domain: str, hostname: str) -> bool:
"""判断 cookie domain 是否适用于给定 hostname。
支持带点前缀的 domain(如 `.saki.lat` 匹配 `api.saki.lat`)。
"""
if not cookie_domain or not hostname:
return True # 无 domain 限制时视为全域
# 去掉前缀点
domain = cookie_domain.lstrip(".")
return hostname == domain or hostname.endswith("." + domain)
def _build_cookie_bundle(
cookies: list[dict[str, Any]],
page_url: str,
) -> tuple[str, list[str]]:
"""按 page_url 的 hostname 过滤并组装完整 cookie 字符串。
返回 (cookie_string, cookie_names_list)。
cookie_string 格式:name1=value1; name2=value2; ...
过滤掉空值 cookie。
"""
hostname = ""
if page_url:
try:
hostname = urlparse(page_url).hostname or ""
except Exception:
pass
parts: list[str] = []
names: list[str] = []
for c in cookies:
name = c.get("name", "")
value = c.get("value", "")
domain = c.get("domain", "")
if not name or not value:
continue
if hostname and not _cookie_matches_hostname(domain, hostname):
continue
parts.append(f"{name}={value}")
names.append(name)
return "; ".join(parts), names
def _curate_candidates(
cookies: list[dict[str, Any]],
local_storage: dict[str, str],
session_storage: dict[str, str],
auth_headers: list[dict[str, str]],
new_api_user: str = "",
page_url: str = "",
) -> list[dict[str, Any]]:
"""Scan extracted data for likely credentials with confidence scoring."""
candidates: list[dict[str, Any]] = []
# 1. CDP-captured network headers (highest confidence)
# 0. 完整 Cookie Bundle(最高优先级)
# 按页面 origin 收集所有相关 cookie,包含 cf_clearance 等 Cloudflare cookie
cookie_string, cookie_names = _build_cookie_bundle(cookies, page_url)
if cookie_string:
bundle_extra: dict[str, Any] = {
"cookie_count": len(cookie_names),
"cookie_names": cookie_names,
}
if new_api_user:
bundle_extra["new_api_user"] = new_api_user
_add(
candidates, "cookie_bundle",
f"bundle:{page_url[:60]}",
cookie_string,
f"[{len(cookie_names)} cookies: {', '.join(cookie_names[:5])}{'' if len(cookie_names) > 5 else ''}]",
f"完整 Cookie 组({len(cookie_names)} 个)",
98,
extra=bundle_extra,
)
# 1. CDP-captured network headers (high confidence)
seen = set()
for h in auth_headers:
dedup_key = h["value"]
@@ -166,7 +244,7 @@ def _curate_candidates(
# Looks like a JWT (xx.yy.zz format)
if val.count(".") >= 2 and 20 < len(val) < 5000:
if dedup_key := val not in seen:
if val not in seen:
seen.add(val)
_add(candidates, "bearer_token", f"{store_name}.{key}", val, _preview(val),
f"{store_name}.{key} (JWT)", 80)
@@ -179,7 +257,7 @@ def _curate_candidates(
if not new_api_user:
new_api_user = _find_new_api_user(local_storage, session_storage)
# 3. Session cookies
# 3. 单个 Session cookie(保留,供独立 fallback / bearer 降级使用)
for c in cookies:
cname = c["name"].lower()
if any(k in cname for k in SESSION_COOKIE_NAMES):
@@ -193,8 +271,9 @@ def _curate_candidates(
f"Cookie {c['name']} ({c['domain']})", confidence,
extra=extra)
# 排序:cookie_bundle 最高 → cookie → bearer/api_key → 其他
candidates.sort(key=lambda item: (
0 if item.get("type") == "cookie" and item.get("cookie_name") == "session" else
0 if item.get("type") == "cookie_bundle" else
1 if item.get("type") == "cookie" else
2,
-int(item.get("confidence") or 0),