init: Parser v1 — Lead Generation Engine

Парсер лидов МБ РФ: Яндекс.Карты + HH.ru + обогащение DaData/ЕГРЮЛ/Rusprofile + Streamlit CRM.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Aks
2026-06-09 12:56:06 +03:00
commit f78f35fb3f
33 changed files with 9198 additions and 0 deletions
+329
View File
@@ -0,0 +1,329 @@
"""DB-слой Streamlit-приложения.
Все запросы к leads.db инкапсулированы здесь. UI-код в app.py не делает
SQL напрямую — только через эти функции.
Стандарт: каждая функция сама открывает/закрывает соединение.
Streamlit перезапускает скрипт на каждое действие — глобальный коннект
держать не имеет смысла.
"""
import json
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Any
import pandas as pd
def _conn(db_path: Path | str) -> sqlite3.Connection:
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
return conn
# ─── Опции для фильтров (что вообще есть в БД) ──────────────────────
def get_all_sources(db_path) -> list[str]:
    """Return the sorted, de-duplicated individual source names found in ``leads``.

    A lead's ``source`` value may hold several comma-separated names
    (e.g. ``'yandex_maps,hh'`` after merging different sources), so each value
    is split on commas, every part is stripped, and empty parts are dropped.
    """
    conn = _conn(db_path)
    try:
        rows = conn.execute(
            "SELECT DISTINCT source FROM leads WHERE source IS NOT NULL ORDER BY source"
        ).fetchall()
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()

    names: set[str] = set()
    for row in rows:
        parts = (part.strip() for part in (row["source"] or "").split(","))
        names.update(part for part in parts if part)
    return sorted(names)
def get_all_regions(db_path) -> list[str]:
    """Return the distinct non-empty ``region`` values from ``leads``, ordered by the database."""
    conn = _conn(db_path)
    try:
        rows = conn.execute(
            "SELECT DISTINCT region FROM leads WHERE region IS NOT NULL AND region != '' ORDER BY region"
        ).fetchall()
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
    return [row["region"] for row in rows]
def get_all_categories(db_path) -> list[str]:
    """Return the distinct non-empty ``category`` values from ``leads``, ordered by the database."""
    conn = _conn(db_path)
    try:
        rows = conn.execute(
            "SELECT DISTINCT category FROM leads WHERE category IS NOT NULL AND category != '' ORDER BY category"
        ).fetchall()
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
    return [row["category"] for row in rows]
# ─── Загрузка лидов с фильтрами ──────────────────────────────────────
def _build_leads_where(filters: dict) -> tuple[str, list]:
    """Translate the ``get_leads`` filter dict into a WHERE expression and its bound parameters."""
    where: list[str] = []
    params: list = []

    if filters.get("sources"):
        # NOTE(review): this is a substring match, so a source name contained
        # in a longer one would match both — confirm against the names in use.
        sources = list(filters["sources"])
        where.append("(" + " OR ".join("source LIKE ?" for _ in sources) + ")")
        params.extend(f"%{s}%" for s in sources)

    if filters.get("regions"):
        regions = list(filters["regions"])
        where.append(f"region IN ({', '.join('?' for _ in regions)})")
        params.extend(regions)

    if filters.get("district_search"):
        where.append("district LIKE ?")
        params.append(f"%{filters['district_search']}%")

    if filters.get("categories"):
        categories = list(filters["categories"])
        where.append(f"category IN ({', '.join('?' for _ in categories)})")
        params.extend(categories)

    if filters.get("statuses"):
        # 'inbox' also covers 'new': older leads (before the migration) had
        # the default status 'new'.
        statuses: list = []
        for status in filters["statuses"]:
            statuses.append(status)
            if status == "inbox":
                statuses.append("new")
        where.append(
            f"COALESCE(outreach_status, 'new') IN ({', '.join('?' for _ in statuses)})"
        )
        params.extend(statuses)

    # COALESCE(score, 0): leads with score=NULL (e.g. added manually) would
    # otherwise be dropped, because `NULL >= 0` is not true in SQL.
    # A bound whose value is None is skipped: comparing against NULL would
    # silently filter out every row.
    for key, op in (("min_score", ">="), ("max_score", "<=")):
        if filters.get(key) is not None:
            where.append(f"COALESCE(score, 0) {op} ?")
            params.append(filters[key])

    if filters.get("name_search"):
        where.append("name LIKE ?")
        params.append(f"%{filters['name_search']}%")

    # "Has a pain for this product": pain_products stores JSON like
    # {"P4": 3.0, ...}. Match the key including its quotes so that P1 does not
    # also match P10.
    if filters.get("pain_products"):
        products = list(filters["pain_products"])
        where.append("(" + " OR ".join("pain_products LIKE ?" for _ in products) + ")")
        params.extend(f'%"{p}"%' for p in products)

    return (" AND ".join(where) if where else "1=1"), params


def get_leads(db_path, filters: dict) -> pd.DataFrame:
    """Load the leads table with the given filters applied.

    Args:
        db_path: Path of the SQLite database to read.
        filters: Dict of optional keys:
            ``sources`` — names matched as substrings of ``source`` (any of);
            ``regions`` / ``categories`` — exact values (any of);
            ``district_search`` / ``name_search`` — substring searches;
            ``statuses`` — ``outreach_status`` values; ``'inbox'`` also matches
            ``'new'`` and NULL;
            ``min_score`` / ``max_score`` — inclusive bounds, a NULL score
            counts as 0, a bound set to ``None`` is ignored;
            ``pain_products`` — product keys looked up in the ``pain_products``
            JSON text (any of).

    Returns:
        A DataFrame ordered by ``score`` descending, then ``id``. In the
        ``outreach_status`` column NULL and ``'new'`` are shown as ``'inbox'``.
    """
    where_sql, params = _build_leads_where(filters)
    cols = """
        id, name, inn, director_name, phone_primary, email_primary, phones, emails,
        website, vk_url, telegram_url, instagram_url, youtube_url,
        address, city, region, district, category,
        reviews_count, reviews_avg, score, score_breakdown,
        pain_products, diagnostic_coverage, band,
        outreach_status, comments, last_action, last_reaction, last_touched_at,
        source, parsed_at
    """
    query = f"""
        SELECT {cols}
        FROM leads
        WHERE {where_sql}
        ORDER BY score DESC, id
    """
    conn = _conn(db_path)
    try:
        df = pd.read_sql_query(query, conn, params=params)
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()

    # Normalize outreach_status for display: NULL / 'new' -> 'inbox'.
    if "outreach_status" in df.columns:
        df["outreach_status"] = df["outreach_status"].fillna("inbox").replace({"new": "inbox"})
    return df
# ─── Один лид ────────────────────────────────────────────────────────
def get_lead_detail(db_path, lead_id: int) -> dict | None:
    """Return one lead as a dict with its JSON columns decoded, or ``None`` if the id is unknown.

    The columns ``phones``, ``phones_extra``, ``emails``, ``score_breakdown``
    and ``pain_products`` are parsed with ``json.loads`` when non-empty; a
    value that fails to parse is left as stored.
    """
    conn = _conn(db_path)
    try:
        row = conn.execute("SELECT * FROM leads WHERE id = ?", (lead_id,)).fetchone()
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
    if row is None:
        return None

    lead = dict(row)
    for field in ("phones", "phones_extra", "emails", "score_breakdown", "pain_products"):
        raw = lead.get(field)
        if not raw:
            continue
        try:
            lead[field] = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            # Deliberate best effort: keep the raw stored value when it is not valid JSON.
            pass
    return lead
# ─── История касаний ────────────────────────────────────────────────
def get_outreach_history(db_path, lead_id: int) -> list[dict]:
    """Return all ``outreach_events`` rows of a lead as dicts, newest first.

    Rows are ordered by ``sent_at`` descending (a NULL ``sent_at`` sorts as
    ``'0000'``, i.e. last), then by ``id`` descending.
    """
    conn = _conn(db_path)
    try:
        rows = conn.execute("""
            SELECT * FROM outreach_events
            WHERE lead_id = ?
            ORDER BY COALESCE(sent_at, '0000') DESC, id DESC
        """, (lead_id,)).fetchall()
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
    return [dict(row) for row in rows]
# ─── Запись нового касания ──────────────────────────────────────────
def record_touch(
    db_path,
    lead_id: int,
    channel: str,
    reaction: str | None = None,
    notes: str | None = None,
    new_status: str | None = None,
    message_text: str | None = None,
) -> int:
    """Record one outreach touch for a lead.

    - Inserts a row into ``outreach_events`` (``sent_at`` is the current local
      time as an ISO string with second precision).
    - Updates ``last_action`` / ``last_reaction`` / ``last_touched_at`` on the lead.
    - Sets ``outreach_status`` too when ``new_status`` is truthy.

    Both writes happen in one transaction: if either statement fails, nothing
    is persisted and the exception propagates.

    Returns:
        The id of the new ``outreach_events`` row.
    """
    now = datetime.now().isoformat(timespec="seconds")

    updates = ["last_action = ?", "last_reaction = ?", "last_touched_at = ?"]
    values: list[Any] = [channel, reaction, now]
    if new_status:
        updates.append("outreach_status = ?")
        values.append(new_status)
    values.append(lead_id)  # bound to the trailing `WHERE id = ?`

    conn = _conn(db_path)
    try:
        # `with conn` commits on success and rolls back if either statement raises.
        with conn:
            cursor = conn.execute("""
                INSERT INTO outreach_events
                (lead_id, channel, message_text, sent_at, reaction, notes)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (lead_id, channel, message_text, now, reaction, notes))
            event_id = cursor.lastrowid
            conn.execute(f"UPDATE leads SET {', '.join(updates)} WHERE id = ?", values)
        return event_id
    finally:
        # The connection context manager does not close; do it explicitly.
        conn.close()
# ─── Обновление полей лида ──────────────────────────────────────────
def update_lead_status(db_path, lead_id: int, status: str) -> None:
    """Set ``outreach_status`` of the lead with the given id and commit."""
    conn = _conn(db_path)
    try:
        # `with conn` commits on success and rolls back if the statement raises.
        with conn:
            conn.execute("UPDATE leads SET outreach_status = ? WHERE id = ?", (status, lead_id))
    finally:
        # The connection context manager does not close; do it explicitly.
        conn.close()
def update_lead_comments(db_path, lead_id: int, comments: str) -> None:
    """Overwrite the ``comments`` field of the lead with the given id and commit."""
    conn = _conn(db_path)
    try:
        # `with conn` commits on success and rolls back if the statement raises.
        with conn:
            conn.execute("UPDATE leads SET comments = ? WHERE id = ?", (comments, lead_id))
    finally:
        # The connection context manager does not close; do it explicitly.
        conn.close()
# ─── Метрики для дашборда ───────────────────────────────────────────
def count_inbox(db_path) -> int:
    """Count leads whose ``outreach_status`` is ``'inbox'``, ``'new'`` or NULL."""
    conn = _conn(db_path)
    try:
        return conn.execute(
            "SELECT COUNT(*) FROM leads WHERE COALESCE(outreach_status, 'new') IN ('inbox', 'new')"
        ).fetchone()[0]
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
def count_in_work(db_path) -> int:
    """Count leads whose ``outreach_status`` is ``'in_work'`` or ``'triaged'``."""
    conn = _conn(db_path)
    try:
        return conn.execute(
            "SELECT COUNT(*) FROM leads WHERE outreach_status IN ('in_work', 'triaged')"
        ).fetchone()[0]
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
def count_done(db_path) -> int:
    """Count leads whose ``outreach_status`` is ``'done'``."""
    conn = _conn(db_path)
    try:
        return conn.execute(
            "SELECT COUNT(*) FROM leads WHERE outreach_status = 'done'"
        ).fetchone()[0]
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
def count_total(db_path) -> int:
    """Count all rows in ``leads``."""
    conn = _conn(db_path)
    try:
        return conn.execute("SELECT COUNT(*) FROM leads").fetchone()[0]
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
# ─── Ручное добавление / удаление компаний (из CRM) ──────────────────
def add_lead_manual(db_path, data: dict) -> int:
    """Insert a company entered by hand in the CRM into the same ``leads`` table.

    The input is first passed through ``database._prepare_lead``; per the
    original author's note this applies the same dedup keys / normalization /
    ``has_website`` / ``parsed_at`` handling as the parser, for consistency.
    A missing score is stored as 0.

    Returns:
        The id of the inserted lead.

    Raises:
        ValueError: on any ``sqlite3.IntegrityError`` (e.g. a duplicate INN or
            phone dedup key); the message is meant to be shown by the UI.
    """
    import database  # parser_v1/database.py (app.py has already put PARENT on sys.path)

    lead = database._prepare_lead(data)
    if lead.get("score") is None:
        # Store 0 rather than NULL so the lead is not hidden by the score filter.
        lead["score"] = 0

    columns = [*database.WRITABLE_FIELDS, "parsed_at"]
    if lead.get("outreach_status"):
        # Not part of WRITABLE_FIELDS, so it is appended explicitly.
        columns.append("outreach_status")

    column_sql = ", ".join(columns)
    marks = ", ".join(["?"] * len(columns))
    row_values = [lead.get(column) for column in columns]

    conn = _conn(db_path)
    try:
        cursor = conn.execute(
            f"INSERT INTO leads ({column_sql}) VALUES ({marks})", row_values
        )
        conn.commit()
        return cursor.lastrowid
    except sqlite3.IntegrityError as e:
        conn.rollback()
        # Pick the user-facing message from the constraint text reported by SQLite.
        detail = str(e).lower()
        if "inn" in detail:
            raise ValueError(f"Компания с таким ИНН уже есть в базе ({data.get('inn')}).") from e
        if "phone" in detail:
            raise ValueError(f"Компания с таким телефоном уже есть в базе.") from e
        raise ValueError(f"Не удалось добавить (дубль): {e}") from e
    finally:
        conn.close()
def delete_lead(db_path, lead_id: int) -> None:
    """Delete a company from the CRM together with its touch history and run links.

    Removes the lead's rows from ``outreach_events`` and ``lead_in_run`` and
    then the ``leads`` row itself, all in one transaction: if any statement
    fails, nothing is deleted and the exception propagates.
    """
    conn = _conn(db_path)
    try:
        # `with conn` commits on success and rolls back if any statement raises.
        with conn:
            conn.execute("DELETE FROM outreach_events WHERE lead_id = ?", (lead_id,))
            conn.execute("DELETE FROM lead_in_run WHERE lead_id = ?", (lead_id,))
            conn.execute("DELETE FROM leads WHERE id = ?", (lead_id,))
    finally:
        # The connection context manager does not close; do it explicitly.
        conn.close()