Files
parser-v1/app/db_layer.py
T
Aks f78f35fb3f init: Parser v1 — Lead Generation Engine
Парсер лидов МБ РФ: Яндекс.Карты + HH.ru + обогащение DaData/ЕГРЮЛ/Rusprofile + Streamlit CRM.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-09 12:56:06 +03:00

330 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""DB-слой Streamlit-приложения.
Все запросы к leads.db инкапсулированы здесь. UI-код в app.py не делает
SQL напрямую — только через эти функции.
Стандарт: каждая функция сама открывает/закрывает соединение.
Streamlit перезапускает скрипт на каждое действие — глобальный коннект
держать не имеет смысла.
"""
import json
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Any
import pandas as pd
def _conn(db_path: Path | str) -> sqlite3.Connection:
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
return conn
# ─── Опции для фильтров (что вообще есть в БД) ──────────────────────
def get_all_sources(db_path) -> list[str]:
    """Return every individual source name found in ``leads.source``.

    Some leads carry several sources in one comma-separated value
    (e.g. ``'yandex_maps,hh'`` after a merge of different sources); those
    values are split so that each source name is reported once.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        De-duplicated source names in sorted order.
    """
    conn = _conn(db_path)
    try:
        rows = conn.execute(
            "SELECT DISTINCT source FROM leads WHERE source IS NOT NULL ORDER BY source"
        ).fetchall()
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
    return sorted(
        {
            name
            for row in rows
            for part in (row["source"] or "").split(",")
            if (name := part.strip())
        }
    )
def get_all_regions(db_path) -> list[str]:
    """Return the distinct non-empty ``region`` values of all leads, sorted.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        Region names ordered by the database (``ORDER BY region``).
    """
    conn = _conn(db_path)
    try:
        rows = conn.execute(
            "SELECT DISTINCT region FROM leads WHERE region IS NOT NULL AND region != '' ORDER BY region"
        ).fetchall()
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
    return [row["region"] for row in rows]
def get_all_categories(db_path) -> list[str]:
    """Return the distinct non-empty ``category`` values of all leads, sorted.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        Category names ordered by the database (``ORDER BY category``).
    """
    conn = _conn(db_path)
    try:
        rows = conn.execute(
            "SELECT DISTINCT category FROM leads WHERE category IS NOT NULL AND category != '' ORDER BY category"
        ).fetchall()
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
    return [row["category"] for row in rows]
# ─── Загрузка лидов с фильтрами ──────────────────────────────────────
def _like_contains(term) -> str:
    """Build a LIKE pattern that matches ``term`` as a literal substring.

    The LIKE metacharacters (percent sign, underscore) and the escape
    character itself are backslash-escaped, so the returned pattern must be
    used together with an ESCAPE clause that names the backslash.
    """
    escaped = (
        str(term).replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    return f"%{escaped}%"


def _build_leads_where(filters: dict) -> tuple[str, list[Any]]:
    """Translate the UI filter dict into a WHERE expression and its parameters."""
    conditions: list[str] = []
    params: list[Any] = []
    # Without ESCAPE, a "%" or "_" typed by the user would act as a wildcard.
    like = "LIKE ? ESCAPE '\\'"

    def match_any_substring(column: str, terms) -> None:
        terms = list(terms)
        conditions.append(
            "(" + " OR ".join(f"{column} {like}" for _ in terms) + ")"
        )
        params.extend(_like_contains(term) for term in terms)

    def match_in(expression: str, values) -> None:
        values = list(values)
        placeholders = ", ".join("?" for _ in values)
        conditions.append(f"{expression} IN ({placeholders})")
        params.extend(values)

    if filters.get("sources"):
        # `source` may hold several comma-separated names, hence substring match.
        match_any_substring("source", filters["sources"])
    if filters.get("regions"):
        match_in("region", filters["regions"])
    if filters.get("district_search"):
        conditions.append(f"district {like}")
        params.append(_like_contains(filters["district_search"]))
    if filters.get("categories"):
        match_in("category", filters["categories"])
    if filters.get("statuses"):
        # 'inbox' also covers 'new': leads created before the migration kept
        # the old default status 'new'.
        statuses: list[Any] = []
        for status in filters["statuses"]:
            statuses.append(status)
            if status == "inbox":
                statuses.append("new")
        match_in("COALESCE(outreach_status, 'new')", statuses)
    # COALESCE(score, 0): leads with score=NULL (e.g. added manually) would
    # otherwise be dropped, because `NULL >= 0` is not true in SQL.
    # A bound that is present but None is skipped: comparing against NULL
    # would silently filter out every row.
    if filters.get("min_score") is not None:
        conditions.append("COALESCE(score, 0) >= ?")
        params.append(filters["min_score"])
    if filters.get("max_score") is not None:
        conditions.append("COALESCE(score, 0) <= ?")
        params.append(filters["max_score"])
    if filters.get("name_search"):
        conditions.append(f"name {like}")
        params.append(_like_contains(filters["name_search"]))
    if filters.get("pain_products"):
        # pain_products stores JSON such as {"P4": 3.0, ...}. The key is
        # matched together with its quotes so that P1 does not match P10.
        match_any_substring(
            "pain_products", [f'"{p}"' for p in filters["pain_products"]]
        )

    where_sql = " AND ".join(conditions) if conditions else "1=1"
    return where_sql, params


def get_leads(db_path, filters: dict) -> pd.DataFrame:
    """Load the leads table with the given filters applied.

    Args:
        db_path: Path to the SQLite database file.
        filters: Mapping with any of the following optional keys:
            ``sources``, ``regions``, ``categories``, ``statuses``,
            ``pain_products`` (collections of values, matched with OR inside
            one key), ``district_search``, ``name_search`` (literal substring
            search), ``min_score``, ``max_score`` (inclusive bounds; a NULL
            score counts as 0). Different keys are combined with AND.
            An empty or ``None`` mapping selects every lead.

    Returns:
        A DataFrame ordered by ``score DESC, id``. In ``outreach_status`` the
        values NULL and ``'new'`` are shown as ``'inbox'``.

    NOTE(review): SQLite's built-in LIKE is case-insensitive for ASCII only,
    so searches with non-ASCII (e.g. Cyrillic) text are case-sensitive.
    """
    where_sql, params = _build_leads_where(filters or {})
    cols = """
        id, name, inn, director_name, phone_primary, email_primary, phones, emails,
        website, vk_url, telegram_url, instagram_url, youtube_url,
        address, city, region, district, category,
        reviews_count, reviews_avg, score, score_breakdown,
        pain_products, diagnostic_coverage, band,
        outreach_status, comments, last_action, last_reaction, last_touched_at,
        source, parsed_at
    """
    query = f"""
        SELECT {cols}
        FROM leads
        WHERE {where_sql}
        ORDER BY score DESC, id
    """
    conn = _conn(db_path)
    try:
        df = pd.read_sql_query(query, conn, params=params)
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
    # Normalize outreach_status for display: NULL / 'new' -> 'inbox'.
    if "outreach_status" in df.columns:
        df["outreach_status"] = (
            df["outreach_status"].fillna("inbox").replace({"new": "inbox"})
        )
    return df
# ─── Один лид ────────────────────────────────────────────────────────
def get_lead_detail(db_path, lead_id: int) -> dict | None:
    """Fetch one lead as a dict, decoding its JSON-encoded columns.

    Args:
        db_path: Path to the SQLite database file.
        lead_id: Value of ``leads.id`` to look up.

    Returns:
        All columns of the lead as a dict, or ``None`` when no row has this
        id. The columns ``phones``, ``phones_extra``, ``emails``,
        ``score_breakdown`` and ``pain_products`` are parsed from JSON when
        they hold a non-empty value; a value that fails to parse is returned
        unchanged.
    """
    conn = _conn(db_path)
    try:
        row = conn.execute(
            "SELECT * FROM leads WHERE id = ?", (lead_id,)
        ).fetchone()
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
    if row is None:
        return None
    lead = dict(row)
    for field in ("phones", "phones_extra", "emails", "score_breakdown", "pain_products"):
        raw = lead.get(field)
        if not raw:
            continue
        try:
            lead[field] = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            # Deliberate best effort: keep the stored value as is rather
            # than fail the whole lookup on one malformed column.
            pass
    return lead
# ─── История касаний ────────────────────────────────────────────────
def get_outreach_history(db_path, lead_id: int) -> list[dict]:
    """Return all outreach events of a lead, most recent first.

    Args:
        db_path: Path to the SQLite database file.
        lead_id: Lead whose ``outreach_events`` rows are returned.

    Returns:
        One dict per event, ordered by ``sent_at`` descending (events with a
        NULL ``sent_at`` sort as ``'0000'``, i.e. last), then by ``id``
        descending.
    """
    conn = _conn(db_path)
    try:
        rows = conn.execute(
            """
            SELECT * FROM outreach_events
            WHERE lead_id = ?
            ORDER BY COALESCE(sent_at, '0000') DESC, id DESC
            """,
            (lead_id,),
        ).fetchall()
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
    return [dict(row) for row in rows]
# ─── Запись нового касания ──────────────────────────────────────────
def record_touch(
    db_path,
    lead_id: int,
    channel: str,
    reaction: str | None = None,
    notes: str | None = None,
    new_status: str | None = None,
    message_text: str | None = None,
) -> int:
    """Record one outreach touch for a lead.

    Inserts a row into ``outreach_events`` and updates ``last_action``,
    ``last_reaction`` and ``last_touched_at`` on the lead. Both statements
    run in a single transaction: if either fails, nothing is written.

    Args:
        db_path: Path to the SQLite database file.
        lead_id: Lead that was contacted.
        channel: Contact channel; stored on the event and as ``last_action``.
        reaction: Lead's reaction, if any; stored on the event and as
            ``last_reaction``.
        notes: Free-form notes stored on the event.
        new_status: When truthy, also written to ``leads.outreach_status``.
        message_text: Text of the message that was sent, if any.

    Returns:
        The id of the new ``outreach_events`` row.
    """
    # Local time without timezone, second precision (ISO 8601 string).
    now = datetime.now().isoformat(timespec="seconds")

    updates = ["last_action = ?", "last_reaction = ?", "last_touched_at = ?"]
    values: list[Any] = [channel, reaction, now]
    if new_status:
        updates.append("outreach_status = ?")
        values.append(new_status)
    values.append(lead_id)

    conn = _conn(db_path)
    try:
        # `with conn` commits on success and rolls back on any exception, so
        # an event is never stored without the matching lead update.
        with conn:
            cursor = conn.execute(
                """
                INSERT INTO outreach_events
                (lead_id, channel, message_text, sent_at, reaction, notes)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (lead_id, channel, message_text, now, reaction, notes),
            )
            event_id = cursor.lastrowid
            conn.execute(
                f"UPDATE leads SET {', '.join(updates)} WHERE id = ?", values
            )
    finally:
        # `with conn` does not close the connection; do it explicitly.
        conn.close()
    return event_id
# ─── Обновление полей лида ──────────────────────────────────────────
def update_lead_status(db_path, lead_id: int, status: str) -> None:
    """Set ``outreach_status`` of the lead with the given id.

    Args:
        db_path: Path to the SQLite database file.
        lead_id: Value of ``leads.id`` to update.
        status: New status value, stored as given.
    """
    conn = _conn(db_path)
    try:
        # `with conn` commits on success and rolls back on any exception.
        with conn:
            conn.execute(
                "UPDATE leads SET outreach_status = ? WHERE id = ?",
                (status, lead_id),
            )
    finally:
        # `with conn` does not close the connection; do it explicitly.
        conn.close()
def update_lead_comments(db_path, lead_id: int, comments: str) -> None:
    """Replace the ``comments`` text of the lead with the given id.

    Args:
        db_path: Path to the SQLite database file.
        lead_id: Value of ``leads.id`` to update.
        comments: New comment text; overwrites the stored value.
    """
    conn = _conn(db_path)
    try:
        # `with conn` commits on success and rolls back on any exception.
        with conn:
            conn.execute(
                "UPDATE leads SET comments = ? WHERE id = ?",
                (comments, lead_id),
            )
    finally:
        # `with conn` does not close the connection; do it explicitly.
        conn.close()
# ─── Метрики для дашборда ───────────────────────────────────────────
def count_inbox(db_path) -> int:
    """Count leads that are still in the inbox.

    A lead counts as inbox when its ``outreach_status`` is ``'inbox'``,
    ``'new'`` or NULL (NULL is treated as ``'new'``).

    Args:
        db_path: Path to the SQLite database file.
    """
    conn = _conn(db_path)
    try:
        return conn.execute(
            "SELECT COUNT(*) FROM leads WHERE COALESCE(outreach_status, 'new') IN ('inbox', 'new')"
        ).fetchone()[0]
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
def count_in_work(db_path) -> int:
    """Count leads whose ``outreach_status`` is ``'in_work'`` or ``'triaged'``.

    Args:
        db_path: Path to the SQLite database file.
    """
    conn = _conn(db_path)
    try:
        return conn.execute(
            "SELECT COUNT(*) FROM leads WHERE outreach_status IN ('in_work', 'triaged')"
        ).fetchone()[0]
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
def count_done(db_path) -> int:
    """Count leads whose ``outreach_status`` is ``'done'``.

    Args:
        db_path: Path to the SQLite database file.
    """
    conn = _conn(db_path)
    try:
        return conn.execute(
            "SELECT COUNT(*) FROM leads WHERE outreach_status = 'done'"
        ).fetchone()[0]
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
def count_total(db_path) -> int:
    """Count all rows in the ``leads`` table.

    Args:
        db_path: Path to the SQLite database file.
    """
    conn = _conn(db_path)
    try:
        return conn.execute("SELECT COUNT(*) FROM leads").fetchone()[0]
    finally:
        # Close even if the query raises, so the connection is never leaked.
        conn.close()
# ─── Ручное добавление / удаление компаний (из CRM) ──────────────────
def add_lead_manual(db_path, data: dict) -> int:
    """Insert a company entered by hand in the CRM into the ``leads`` table.

    The record is passed through ``database._prepare_lead`` so that it gets
    the same preparation as leads written by the parser.

    Args:
        db_path: Path to the SQLite database file.
        data: Raw lead fields as entered in the UI.

    Returns:
        The id of the newly inserted lead.

    Raises:
        ValueError: The insert violated a database constraint. For a UNIQUE
            violation the message names the duplicate (INN or phone); any
            other constraint failure is reported without claiming a
            duplicate. The message is intended to be shown in the UI.
    """
    # Project-local module; NOTE(review): relies on the parent directory
    # already being on sys.path (the original comment says app.py does this).
    import database

    prepared = database._prepare_lead(data)
    if prepared.get("score") is None:
        # A NULL score would hide the lead from score-filtered views.
        prepared["score"] = 0

    fields = [*database.WRITABLE_FIELDS, "parsed_at"]
    if prepared.get("outreach_status"):
        # Not part of WRITABLE_FIELDS, so it has to be added explicitly.
        fields.append("outreach_status")
    cols = ", ".join(fields)
    placeholders = ", ".join("?" for _ in fields)
    values = [prepared.get(f) for f in fields]

    conn = _conn(db_path)
    try:
        cur = conn.execute(
            f"INSERT INTO leads ({cols}) VALUES ({placeholders})", values
        )
        conn.commit()
        return cur.lastrowid
    except sqlite3.IntegrityError as e:
        conn.rollback()
        msg = str(e).lower()
        if "unique" not in msg:
            # NOT NULL / CHECK / foreign-key failures are not duplicates;
            # do not mislead the user into searching for an existing record.
            raise ValueError(f"Не удалось добавить: {e}") from e
        if "inn" in msg:
            raise ValueError(
                f"Компания с таким ИНН уже есть в базе ({data.get('inn')})."
            ) from e
        if "phone" in msg:
            raise ValueError("Компания с таким телефоном уже есть в базе.") from e
        raise ValueError(f"Не удалось добавить (дубль): {e}") from e
    finally:
        conn.close()
def delete_lead(db_path, lead_id: int) -> None:
    """Delete a lead together with its outreach history and run links.

    Removes the lead's rows from ``outreach_events`` and ``lead_in_run`` and
    then the ``leads`` row itself. All three deletes run in one transaction,
    so a failure leaves the database unchanged.

    Args:
        db_path: Path to the SQLite database file.
        lead_id: Value of ``leads.id`` to delete.
    """
    conn = _conn(db_path)
    try:
        # `with conn` commits on success and rolls back on any exception, so
        # a lead is never left half-deleted.
        with conn:
            # Dependent rows first, then the lead itself.
            conn.execute("DELETE FROM outreach_events WHERE lead_id = ?", (lead_id,))
            conn.execute("DELETE FROM lead_in_run WHERE lead_id = ?", (lead_id,))
            conn.execute("DELETE FROM leads WHERE id = ?", (lead_id,))
    finally:
        # `with conn` does not close the connection; do it explicitly.
        conn.close()