fix #29 : envoyer le lien magique par email (envoyer_mail_smtp)
This commit is contained in:
@@ -0,0 +1,276 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
 * In-memory full-text search engine.
 *
 * Algorithm: multi-field scoring that combines exact match, substring match
 * and trigram similarity. search() uses AND logic: every query token must
 * match somewhere in the article for the article to be returned.
 *
 * Score per token (best matching word of a field):
 *   1.0         → identical word       (e.g. "linky" = "linky")
 *   0.75        → substring            (e.g. "voiture" ⊂ "voitures")
 *   ~0.30–0.55  → trigram similarity   (e.g. "linki" ≈ "linky")
 *
 * Field weights: title × 6, category × 3, content × 1.
 */
class SearchEngine
{
    private const TITLE_WEIGHT = 6.0;
    private const CAT_WEIGHT = 3.0;
    private const CONTENT_WEIGHT = 1.0;

    private const EXACT_SCORE = 1.0;
    private const SUBSTRING_SCORE = 0.75;
    private const FUZZY_FLOOR = 0.55;      // minimum trigram similarity to count as a fuzzy hit
    private const FUZZY_WEIGHT = 0.55;     // multiplier keeping fuzzy hits below substring hits

    private const MIN_WORD_LEN = 2;        // shorter words are dropped by tokenize()
    private const MIN_SUBSTRING_LEN = 3;   // shorter strings never take part in substring matching
    private const MIN_FUZZY_LEN = 4;       // shorter tokens never take part in fuzzy matching

    private const SNIPPET_LEN = 220;       // snippet window, in characters
    private const SNIPPET_LEAD = 60;       // characters of context kept before the first hit

    /** Accented character → plain replacement, applied after lowercasing. */
    private const TRANSLITERATION = [
        'à' => 'a', 'â' => 'a', 'ä' => 'a',
        'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
        'î' => 'i', 'ï' => 'i',
        'ô' => 'o', 'ö' => 'o',
        'ù' => 'u', 'û' => 'u', 'ü' => 'u',
        'ç' => 'c', 'æ' => 'ae', 'œ' => 'oe', 'ñ' => 'n',
    ];

    /**
     * Runs an AND search: an article is returned only if every query token
     * matches at least one of its fields. Results are ordered by tier
     * (ascending), then by score (descending).
     *
     * @param array<array> $articles Raw article list; each entry may carry
     *                               'title', 'category', and 'plain' or 'content'
     * @return array<array{article: array, score: float, snippet: string, tier: int}>
     */
    public function search(string $query, array $articles): array
    {
        $tokens = $this->tokenize($query);
        if ($tokens === []) {
            return [];
        }

        $results = [];
        foreach ($articles as $article) {
            [$plain, $tWords, $cWords, $pWords] = $this->indexArticle($article);

            $score = $this->scoreArticle($tokens, $tWords, $cWords, $pWords);
            if ($score > 0.0) {
                $results[] = [
                    'article' => $article,
                    'score'   => $score,
                    'snippet' => $this->buildSnippet($plain, $tokens),
                    'tier'    => $this->determineTier($tokens, $tWords, $cWords, $pWords),
                ];
            }
        }

        usort($results, static function (array $a, array $b): int {
            if ($a['tier'] !== $b['tier']) {
                return $a['tier'] <=> $b['tier'];
            }

            return $b['score'] <=> $a['score'];
        });

        return $results;
    }

    /**
     * Extracts the plain text and the tokenized fields of one article.
     * 'plain' is used when present; otherwise 'content' is stripped of
     * Markdown on the fly.
     *
     * @return array{0: string, 1: string[], 2: string[], 3: string[]}
     *         [plain text, title words, category words, content words]
     */
    private function indexArticle(array $article): array
    {
        $plain = $article['plain'] ?? $this->stripMarkdown($article['content'] ?? '');

        return [
            $plain,
            $this->tokenize($article['title'] ?? ''),
            $this->tokenize($article['category'] ?? ''),
            $this->tokenize($plain),
        ];
    }

    // ─── Scoring ─────────────────────────────────────────────────────────────

    /**
     * Sums the weighted per-field score of every token.
     * Returns 0.0 as soon as one token matches no field (strict AND).
     */
    private function scoreArticle(array $tokens, array $tWords, array $cWords, array $pWords): float
    {
        $total = 0.0;
        foreach ($tokens as $token) {
            $ts = $this->tokenScore($token, $tWords) * self::TITLE_WEIGHT
                + $this->tokenScore($token, $cWords) * self::CAT_WEIGHT
                + $this->tokenScore($token, $pWords) * self::CONTENT_WEIGHT;

            if ($ts <= 0.0) {
                return 0.0; // strict AND: token not found anywhere → article excluded
            }
            $total += $ts;
        }

        return $total;
    }

    /**
     * Ranks a result into a tier (fuzzy matching is ignored here; both exact
     * and substring hits count as "found"):
     *   1 → every token found in the title
     *   2 → every token found in title, category or content
     *   3 → at least one token only reachable through fuzzy matching
     */
    private function determineTier(array $tokens, array $tWords, array $cWords, array $pWords): int
    {
        $inTitle = true;
        foreach ($tokens as $token) {
            if ($this->tokenScore($token, $tWords, false) < self::SUBSTRING_SCORE) {
                $inTitle = false;
                break;
            }
        }
        if ($inTitle) {
            return 1;
        }

        $allWords = array_merge($tWords, $cWords, $pWords);
        foreach ($tokens as $token) {
            if ($this->tokenScore($token, $allWords, false) < self::SUBSTRING_SCORE) {
                return 3;
            }
        }

        return 2;
    }

    /**
     * Returns a 0–1 score measuring how well $token matches the best word
     * of $words.
     *
     * @param string[] $words Normalized words of one field
     * @param bool     $fuzzy Whether trigram similarity may be used
     */
    private function tokenScore(string $token, array $words, bool $fuzzy = true): float
    {
        $best = 0.0;
        $tLen = mb_strlen($token);
        $substringAllowed = $tLen >= self::MIN_SUBSTRING_LEN;
        // Token trigrams do not depend on the word: compute them once.
        $tokenTrigrams = ($fuzzy && $tLen >= self::MIN_FUZZY_LEN) ? $this->trigrams($token) : [];

        foreach ($words as $w) {
            if ($w === $token) {
                return self::EXACT_SCORE;
            }
            if ($best >= self::SUBSTRING_SCORE) {
                continue; // only an exact match can still improve the score
            }

            // Substring match (plurals, conjugations). The reverse direction
            // (word inside token) applies the same minimum length to the word,
            // otherwise two-letter words such as "ta" would match "table".
            if (
                $substringAllowed
                && (
                    str_contains($w, $token)
                    || (mb_strlen($w) >= self::MIN_SUBSTRING_LEN && str_contains($token, $w))
                )
            ) {
                $best = self::SUBSTRING_SCORE;
                continue;
            }

            if ($tokenTrigrams !== []) {
                $sim = $this->trigramSimilarity($tokenTrigrams, $this->trigrams($w));
                if ($sim >= self::FUZZY_FLOOR) {
                    $best = max($best, $sim * self::FUZZY_WEIGHT); // fuzzy (typos)
                }
            }
        }

        return $best;
    }

    /**
     * Computes a cumulative (OR) score for several tokens over a set of articles.
     * Each article is tokenized once, instead of once per token as repeated
     * search() calls would do. Fuzzy (trigram) matching is disabled on the
     * content field to limit the cost.
     *
     * Articles without an int/string 'uuid' cannot be keyed and are skipped.
     *
     * @param string[] $tokens   Normalized words (lowercase, no accents)
     * @param array[]  $articles Articles (uuid, title, category, plain|content)
     * @return array{0: array<string, float>, 1: array<string, array>}
     *         [uuid → score, uuid → article], only for articles scoring above 0
     */
    public function scorePool(array $tokens, array $articles): array
    {
        if ($tokens === [] || $articles === []) {
            return [[], []];
        }

        $scoreMap = [];
        $articleMap = [];

        foreach ($articles as $article) {
            $uuid = $article['uuid'] ?? null;
            if (!is_string($uuid) && !is_int($uuid)) {
                continue;
            }

            [, $tWords, $cWords, $pWords] = $this->indexArticle($article);

            $total = 0.0;
            foreach ($tokens as $token) {
                $total += $this->tokenScore($token, $tWords, true) * self::TITLE_WEIGHT
                    + $this->tokenScore($token, $cWords, true) * self::CAT_WEIGHT
                    + $this->tokenScore($token, $pWords, false) * self::CONTENT_WEIGHT;
            }

            if ($total > 0.0) {
                $scoreMap[$uuid] = $total;
                $articleMap[$uuid] = $article;
            }
        }

        return [$scoreMap, $articleMap];
    }

    // ─── Trigrams ────────────────────────────────────────────────────────────

    /**
     * Similarity of two trigram sets: shared trigrams divided by the size of
     * the larger set. Returns 0.0 when either set is empty.
     *
     * @param string[] $tA Distinct trigrams of the first string
     * @param string[] $tB Distinct trigrams of the second string
     */
    private function trigramSimilarity(array $tA, array $tB): float
    {
        if ($tA === [] || $tB === []) {
            return 0.0;
        }

        return count(array_intersect($tA, $tB)) / max(count($tA), count($tB));
    }

    /**
     * Distinct 3-character windows of $s (empty when $s has fewer than 3 characters).
     *
     * @return string[]
     */
    private function trigrams(string $s): array
    {
        $out = [];
        $len = mb_strlen($s);
        for ($i = 0; $i + 2 < $len; $i++) {
            $out[] = mb_substr($s, $i, 3);
        }

        return array_unique($out);
    }

    // ─── Highlighted snippet ─────────────────────────────────────────────────

    /**
     * Builds an HTML-safe excerpt of $text around the first query token found,
     * with every token occurrence wrapped in <mark>.
     *
     * Matching is done on the raw text, case- and accent-insensitively, and in
     * a single pass; each piece is HTML-escaped afterwards. Tokens can therefore
     * never match inside an HTML entity or inside an inserted <mark> tag.
     *
     * @param string[] $tokens Normalized query tokens
     * @return string HTML fragment (escaped text plus <mark> elements)
     */
    private function buildSnippet(string $text, array $tokens): string
    {
        // Window start: a little before the first token (in query order) found in the text.
        $start = 0;
        foreach ($tokens as $token) {
            $pattern = $this->tokenPattern($token);
            if ($pattern === '') {
                continue;
            }
            if (preg_match('/' . $pattern . '/iu', $text, $m, PREG_OFFSET_CAPTURE) === 1) {
                // PCRE reports byte offsets; convert to a character offset.
                $charPos = mb_strlen(substr($text, 0, $m[0][1]));
                $start = max(0, $charPos - self::SNIPPET_LEAD);
                break;
            }
        }

        $raw = mb_substr($text, $start, self::SNIPPET_LEN);
        if ($start > 0) {
            $raw = '…' . ltrim($raw);
        }
        if ($start + self::SNIPPET_LEN < mb_strlen($text)) {
            $raw .= '…';
        }

        // Longest tokens first, so the longest alternative wins at a given position.
        $ordered = array_values(array_unique($tokens));
        usort($ordered, static fn (string $a, string $b): int => mb_strlen($b) <=> mb_strlen($a));
        $alternatives = array_values(array_filter(
            array_map(fn (string $t): string => $this->tokenPattern($t), $ordered),
            static fn (string $p): bool => $p !== '',
        ));

        $parts = $alternatives === []
            ? false
            : preg_split('/(' . implode('|', $alternatives) . ')/iu', $raw, -1, PREG_SPLIT_DELIM_CAPTURE);

        if ($parts === false) {
            // Nothing to highlight, or the text is not valid UTF-8.
            return htmlspecialchars($raw, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        }

        // With one capturing group, odd indexes hold the matched tokens.
        $html = '';
        foreach ($parts as $i => $part) {
            $safe = htmlspecialchars($part, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            $html .= ($i % 2 === 1) ? '<mark>' . $safe . '</mark>' : $safe;
        }

        return $html;
    }

    /**
     * Regex fragment (no delimiters, no capturing group) matching $token in
     * raw text regardless of the accents removed by normalize(): "e" also
     * matches "é", "ae" also matches "æ", and so on.
     */
    private function tokenPattern(string $token): string
    {
        $chars = preg_split('//u', $token, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        $variants = $this->accentVariants();
        $pattern = '';

        for ($i = 0, $n = count($chars); $i < $n; $i++) {
            if (isset($chars[$i + 1])) {
                $pair = $chars[$i] . $chars[$i + 1];
                if (isset($variants[$pair])) {
                    // Two-letter transliteration ("ae", "oe"): either both letters or the ligature.
                    $pattern .= '(?:' . $this->charPattern($chars[$i]) . $this->charPattern($chars[$i + 1])
                        . '|' . implode('|', $variants[$pair]) . ')';
                    $i++;
                    continue;
                }
            }
            $pattern .= $this->charPattern($chars[$i]);
        }

        return $pattern;
    }

    /** Regex fragment matching one plain character or any of its accented variants. */
    private function charPattern(string $char): string
    {
        $quoted = preg_quote($char, '/');
        $accented = $this->accentVariants()[$char] ?? [];

        return $accented === [] ? $quoted : '[' . $quoted . implode('', $accented) . ']';
    }

    /**
     * Inverse of TRANSLITERATION: plain replacement → accented originals.
     *
     * @return array<string, string[]>
     */
    private function accentVariants(): array
    {
        static $variants = null;
        if ($variants === null) {
            $variants = [];
            foreach (self::TRANSLITERATION as $accented => $plain) {
                $variants[$plain][] = $accented;
            }
        }

        return $variants;
    }

    // ─── Text helpers ────────────────────────────────────────────────────────

    /**
     * Splits text into normalized words of at least 2 characters.
     *
     * @return string[]
     */
    private function tokenize(string $text): array
    {
        $norm = $this->normalize($text);
        $words = preg_split('/\W+/u', $norm, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter(
            $words,
            static fn (string $w): bool => mb_strlen($w) >= self::MIN_WORD_LEN,
        ));
    }

    /** Lowercases the text and transliterates the accents listed in TRANSLITERATION. */
    private function normalize(string $text): string
    {
        return strtr(mb_strtolower($text, 'UTF-8'), self::TRANSLITERATION);
    }

    /** Removes Markdown syntax to extract plain text. */
    private function stripMarkdown(string $md): string
    {
        // Applied in order; a failing rule leaves the text unchanged.
        $rules = [
            '/!\[[^\]]*\]\([^)]+\)/'              => '',   // images
            '/\[([^\]]+)\]\([^)]+\)/'             => '$1', // links → label
            '/```[\s\S]*?```/'                    => '',   // fenced code blocks
            '/`[^`]+`/'                           => '',   // inline code
            '/^#{1,6}\s*/m'                       => '',   // headings
            '/[*_~]{1,3}([^*_~]+)[*_~]{1,3}/'     => '$1', // bold / italic / strike
            '/^\s*[-*+|>]\s*/m'                   => '',   // lists, quotes, tables
            '/\n{2,}/'                            => ' ',  // paragraph breaks
        ];

        $text = $md;
        foreach ($rules as $pattern => $replacement) {
            $text = preg_replace($pattern, $replacement, $text) ?? $text;
        }

        return trim($text);
    }
}
|
||||
Reference in New Issue
Block a user