fix #29 : envoyer le lien magique par email (envoyer_mail_smtp)

2026-05-13 23:41:58 +02:00
commit 8a85c15372
129 changed files with 22818 additions and 0 deletions
@@ -0,0 +1,276 @@
+<?php
+
+declare(strict_types=1);
+
+/**
+ * In-memory full-text search engine.
+ *
+ * Algorithm: multi-field scoring that combines exact word matches, substring
+ * matches and trigram similarity. search() applies AND logic: every query
+ * token must match somewhere in an article for that article to be returned.
+ *
+ * Score per token (best matching word of a field):
+ *   1.0          → identical word       (e.g. "linky" = "linky")
+ *   0.75         → substring            (e.g. "voiture" ⊂ "voitures")
+ *   0.3025–0.55  → trigram similarity   (e.g. "linki" ≈ "linky")
+ *
+ * Field weights: title × 6, category × 3, content × 1.
+ */
+class SearchEngine
+{
+    private const TITLE_WEIGHT   = 6.0;
+    private const CAT_WEIGHT     = 3.0;
+    private const CONTENT_WEIGHT = 1.0;
+
+    private const EXACT_SCORE    = 1.0;
+    private const SUBSTR_SCORE   = 0.75;
+    private const FUZZY_FLOOR    = 0.55; // minimum trigram similarity for a fuzzy hit
+    private const FUZZY_WEIGHT   = 0.55; // factor applied to an accepted trigram similarity
+    private const MIN_SUBSTR_LEN = 3;    // shortest string allowed to act as the contained part
+    private const MIN_FUZZY_LEN  = 4;    // shortest token compared by trigram
+
+    private const SNIPPET_LEN    = 220;
+    private const SNIPPET_LEAD   = 60;   // characters of context kept before the first hit
+
+    /** Regex character classes that undo the accent folding done by normalize(). */
+    private const ACCENT_CLASSES = [
+        'a' => '[aàâä]',
+        'c' => '[cç]',
+        'e' => '[eéèêë]',
+        'i' => '[iîï]',
+        'n' => '[nñ]',
+        'o' => '[oôö]',
+        'u' => '[uùûü]',
+    ];
+
+    /** Letter pairs that normalize() may have produced from a single ligature. */
+    private const LIGATURES = [
+        'ae' => 'æ',
+        'oe' => 'œ',
+    ];
+
+    /**
+     * Runs an AND search of $query over $articles.
+     *
+     * Results are ordered by tier (ascending, see determineTier()) and then by
+     * descending score.
+     *
+     * @param array<array> $articles Raw article list (from ArticleManager). Each entry may
+     *                               provide 'title', 'category' and 'plain'; when 'plain'
+     *                               is absent the Markdown in 'content' is stripped instead.
+     * @return array<array{article: array, score: float, snippet: string, tier: int}>
+     */
+    public function search(string $query, array $articles): array
+    {
+        $tokens = $this->tokenize($query);
+        if ($tokens === []) {
+            return [];
+        }
+
+        $results = [];
+        foreach ($articles as $article) {
+            // 'plain' is pre-computed in search_index.json; otherwise strip the Markdown on the fly.
+            $plain  = $article['plain'] ?? $this->stripMarkdown($article['content'] ?? '');
+            $tWords = $this->uniqueWords($article['title']    ?? '');
+            $cWords = $this->uniqueWords($article['category'] ?? '');
+            $pWords = $this->uniqueWords($plain);
+
+            $score = $this->scoreArticle($tokens, $tWords, $cWords, $pWords);
+            if ($score <= 0.0) {
+                continue;
+            }
+
+            $results[] = [
+                'article' => $article,
+                'score'   => $score,
+                'snippet' => $this->buildSnippet($plain, $tokens),
+                'tier'    => $this->determineTier($tokens, $tWords, $cWords, $pWords),
+            ];
+        }
+
+        usort($results, static function (array $a, array $b): int {
+            if ($a['tier'] !== $b['tier']) {
+                return $a['tier'] <=> $b['tier'];
+            }
+            return $b['score'] <=> $a['score'];
+        });
+        return $results;
+    }
+
+    // ─── Scoring ─────────────────────────────────────────────────────────────
+
+    /**
+     * Sums the weighted per-field score of every token.
+     *
+     * @param string[] $tokens Normalized query tokens
+     * @param string[] $tWords Title words
+     * @param string[] $cWords Category words
+     * @param string[] $pWords Content words
+     * @return float 0.0 as soon as one token matches no field (strict AND), else the total
+     */
+    private function scoreArticle(array $tokens, array $tWords, array $cWords, array $pWords): float
+    {
+        $total = 0.0;
+        foreach ($tokens as $token) {
+            $ts = $this->tokenScore($token, $tWords)   * self::TITLE_WEIGHT
+                + $this->tokenScore($token, $cWords)   * self::CAT_WEIGHT
+                + $this->tokenScore($token, $pWords)   * self::CONTENT_WEIGHT;
+
+            if ($ts <= 0.0) {
+                return 0.0; // strict AND: token not found anywhere → article excluded
+            }
+            $total += $ts;
+        }
+        return $total;
+    }
+
+    /**
+     * Classifies a result into a tier (fuzzy matching is ignored here):
+     *   1 → every token matches a title word, exactly or as a substring
+     *   2 → every token matches a title, category or content word, exactly or as a substring
+     *   3 → at least one token only matched through fuzzy (trigram) similarity
+     */
+    private function determineTier(array $tokens, array $tWords, array $cWords, array $pWords): int
+    {
+        $inTitle = true;
+        foreach ($tokens as $token) {
+            if ($this->tokenScore($token, $tWords, false) < self::SUBSTR_SCORE) {
+                $inTitle = false;
+                break;
+            }
+        }
+        if ($inTitle) {
+            return 1;
+        }
+
+        $allWords = array_merge($tWords, $cWords, $pWords);
+        foreach ($tokens as $token) {
+            if ($this->tokenScore($token, $allWords, false) < self::SUBSTR_SCORE) {
+                return 3;
+            }
+        }
+        return 2;
+    }
+
+    /**
+     * Returns a 0–1 score measuring how well $token matches the best word of $words.
+     *
+     * @param string[] $words Normalized words of one field
+     * @param bool     $fuzzy Whether trigram similarity may be used as a last resort
+     */
+    private function tokenScore(string $token, array $words, bool $fuzzy = true): float
+    {
+        $tLen           = mb_strlen($token);
+        $allowSubstring = $tLen >= self::MIN_SUBSTR_LEN;
+        // The token's trigrams do not depend on the word: compute them once.
+        $tokenTrigrams  = ($fuzzy && $tLen >= self::MIN_FUZZY_LEN) ? $this->trigrams($token) : [];
+
+        $best = 0.0;
+        foreach ($words as $w) {
+            if ($w === $token) {
+                return self::EXACT_SCORE;
+            }
+            if ($best >= self::SUBSTR_SCORE) {
+                continue; // only an exact match can still improve the score
+            }
+            if ($allowSubstring && $this->isSubstringMatch($token, $w)) {
+                $best = self::SUBSTR_SCORE; // plurals, conjugations
+                continue;
+            }
+            if ($tokenTrigrams !== []) {
+                $sim = $this->trigramOverlap($tokenTrigrams, $this->trigrams($w));
+                if ($sim >= self::FUZZY_FLOOR) {
+                    $best = max($best, $sim * self::FUZZY_WEIGHT); // typos
+                }
+            }
+        }
+        return $best;
+    }
+
+    /**
+     * Tells whether one of $token / $word contains the other.
+     *
+     * The contained string must be at least MIN_SUBSTR_LEN characters long in
+     * both directions; otherwise two-letter article words such as "de" or "le"
+     * would match any token that happens to contain them and defeat the AND filter.
+     */
+    private function isSubstringMatch(string $token, string $word): bool
+    {
+        if (str_contains($word, $token)) {
+            return true;
+        }
+        return mb_strlen($word) >= self::MIN_SUBSTR_LEN && str_contains($token, $word);
+    }
+
+    /**
+     * Computes a cumulative (OR) score for several tokens over a set of articles.
+     * Each article is tokenized only once, which avoids N tokenizations for N calls to search().
+     * Fuzzy (trigram) matching is disabled on the content field (weight 1.0) for performance.
+     * Articles without a usable 'uuid' are skipped because the maps are keyed by it.
+     *
+     * @param  string[] $tokens   Normalized words (lowercase, no accents)
+     * @param  array[]  $articles Articles (must have uuid, title, category, plain|content)
+     * @return array{0: array<string, float>, 1: array<string, array>} [uuid → score, uuid → article]
+     */
+    public function scorePool(array $tokens, array $articles): array
+    {
+        if ($tokens === [] || $articles === []) {
+            return [[], []];
+        }
+
+        $scoreMap   = [];
+        $articleMap = [];
+
+        foreach ($articles as $article) {
+            $uuid = $article['uuid'] ?? null;
+            if (!is_string($uuid) && !is_int($uuid)) {
+                continue; // cannot be keyed; avoids collapsing entries under the '' key
+            }
+
+            $plain  = $article['plain'] ?? $this->stripMarkdown($article['content'] ?? '');
+            $tWords = $this->uniqueWords($article['title']    ?? '');
+            $cWords = $this->uniqueWords($article['category'] ?? '');
+            $pWords = $this->uniqueWords($plain);
+
+            $total = 0.0;
+            foreach ($tokens as $token) {
+                $total += $this->tokenScore($token, $tWords, true)  * self::TITLE_WEIGHT
+                    + $this->tokenScore($token, $cWords, true)  * self::CAT_WEIGHT
+                    + $this->tokenScore($token, $pWords, false) * self::CONTENT_WEIGHT;
+            }
+
+            if ($total > 0.0) {
+                $scoreMap[$uuid]   = $total;
+                $articleMap[$uuid] = $article;
+            }
+        }
+
+        return [$scoreMap, $articleMap];
+    }
+
+    // ─── Trigrams ────────────────────────────────────────────────────────────
+
+    /**
+     * Ratio of shared trigrams to the size of the larger trigram set.
+     *
+     * @param string[] $tA Distinct trigrams of the first string
+     * @param string[] $tB Distinct trigrams of the second string
+     * @return float Value in [0, 1]; 0.0 when either set is empty
+     */
+    private function trigramOverlap(array $tA, array $tB): float
+    {
+        if ($tA === [] || $tB === []) {
+            return 0.0;
+        }
+        $common = count(array_intersect($tA, $tB));
+        return $common / max(count($tA), count($tB));
+    }
+
+    /** @return string[] Distinct 3-character windows of $s (empty when $s is shorter than 3) */
+    private function trigrams(string $s): array
+    {
+        $out = [];
+        $len = mb_strlen($s);
+        for ($i = 0; $i + 2 < $len; $i++) {
+            $out[] = mb_substr($s, $i, 3);
+        }
+        return array_unique($out);
+    }
+
+    // ─── Highlighted snippet ─────────────────────────────────────────────────
+
+    /**
+     * Builds an HTML-escaped excerpt of $text around the first token found,
+     * with every token occurrence wrapped in <mark>.
+     *
+     * The hit position is looked up in the normalized text and applied to the
+     * original one; ligatures expanded by normalize() can shift it by a few
+     * characters, which the SNIPPET_LEAD margin absorbs in practice.
+     *
+     * @param string[] $tokens Normalized query tokens
+     */
+    private function buildSnippet(string $text, array $tokens): string
+    {
+        $norm  = $this->normalize($text);
+        $start = 0;
+        foreach ($tokens as $token) {
+            $hit = mb_strpos($norm, $token);
+            if ($hit !== false) {
+                $start = max(0, $hit - self::SNIPPET_LEAD);
+                break;
+            }
+        }
+
+        $raw = mb_substr($text, $start, self::SNIPPET_LEN);
+        if ($start > 0) {
+            $raw = '…' . ltrim($raw);
+        }
+        if ($start + self::SNIPPET_LEN < mb_strlen($text)) {
+            $raw .= '…';
+        }
+
+        return $this->highlight($raw, $tokens);
+    }
+
+    /**
+     * Escapes $raw for HTML and wraps token occurrences in <mark>.
+     *
+     * Matching is done on the raw text in a single pass and each piece is
+     * escaped afterwards, so a token can never match inside an HTML entity
+     * ("amp" in "&amp;") or inside a previously inserted <mark> tag.
+     * Matching is case- and accent-insensitive, like the search itself.
+     *
+     * @param string[] $tokens Normalized query tokens
+     */
+    private function highlight(string $raw, array $tokens): string
+    {
+        $flags  = ENT_QUOTES | ENT_SUBSTITUTE;
+        $unique = array_values(array_unique($tokens));
+        if ($unique === []) {
+            return htmlspecialchars($raw, $flags, 'UTF-8');
+        }
+
+        // Longest first so that "voitures" wins over "voiture" at the same position.
+        usort($unique, static fn (string $a, string $b): int => mb_strlen($b) <=> mb_strlen($a));
+        $alternatives = array_map($this->accentInsensitivePattern(...), $unique);
+
+        $parts = preg_split('/(' . implode('|', $alternatives) . ')/iu', $raw, -1, PREG_SPLIT_DELIM_CAPTURE);
+        if ($parts === false) {
+            // PCRE failure (e.g. malformed UTF-8): still return a safe, unhighlighted snippet.
+            return htmlspecialchars($raw, $flags, 'UTF-8');
+        }
+
+        $html = '';
+        foreach ($parts as $index => $part) {
+            $escaped = htmlspecialchars($part, $flags, 'UTF-8');
+            // With a single capture group, odd indexes hold the matched delimiters.
+            $html .= $index % 2 === 1 ? '<mark>' . $escaped . '</mark>' : $escaped;
+        }
+        return $html;
+    }
+
+    /**
+     * Turns a normalized token into a regex fragment that also matches the
+     * accented letters and ligatures normalize() would have folded into it.
+     * The fragment contains no capturing group.
+     */
+    private function accentInsensitivePattern(string $token): string
+    {
+        $chars   = mb_str_split($token);
+        $count   = count($chars);
+        $pattern = '';
+
+        for ($i = 0; $i < $count; $i++) {
+            $next = $chars[$i + 1] ?? '';
+            $pair = $chars[$i] . $next;
+            if ($next !== '' && isset(self::LIGATURES[$pair])) {
+                $pattern .= '(?:' . $this->charPattern($chars[$i]) . $this->charPattern($next)
+                    . '|' . self::LIGATURES[$pair] . ')';
+                $i++; // both letters of the pair are consumed
+                continue;
+            }
+            $pattern .= $this->charPattern($chars[$i]);
+        }
+        return $pattern;
+    }
+
+    /** Regex fragment matching one normalized character and its accented variants. */
+    private function charPattern(string $char): string
+    {
+        return self::ACCENT_CLASSES[$char] ?? preg_quote($char, '/');
+    }
+
+    // ─── Text helpers ────────────────────────────────────────────────────────
+
+    /**
+     * Splits text into normalized words (min. 2 characters).
+     *
+     * @return string[]
+     */
+    private function tokenize(string $text): array
+    {
+        $norm  = $this->normalize($text);
+        $words = preg_split('/\W+/u', $norm, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+        return array_values(array_filter($words, static fn (string $w): bool => mb_strlen($w) >= 2));
+    }
+
+    /**
+     * Tokenizes an article field and drops duplicate words.
+     *
+     * tokenScore() keeps the best score over all words, so duplicates never
+     * change the result and only cost time on long contents.
+     *
+     * @return string[]
+     */
+    private function uniqueWords(string $text): array
+    {
+        return array_values(array_unique($this->tokenize($text)));
+    }
+
+    /** Lowercases and transliterates French accents and ligatures. */
+    private function normalize(string $text): string
+    {
+        $text = mb_strtolower($text, 'UTF-8');
+        return strtr($text, [
+            'à' => 'a', 'â' => 'a', 'ä' => 'a',
+            'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
+            'î' => 'i', 'ï' => 'i',
+            'ô' => 'o', 'ö' => 'o',
+            'ù' => 'u', 'û' => 'u', 'ü' => 'u',
+            'ç' => 'c', 'æ' => 'ae', 'œ' => 'oe', 'ñ' => 'n',
+        ]);
+    }
+
+    /** Removes Markdown syntax to extract the plain text. */
+    private function stripMarkdown(string $md): string
+    {
+        $t = preg_replace('/!\[[^\]]*\]\([^)]+\)/', '', $md)         ?? $md; // images
+        $t = preg_replace('/\[([^\]]+)\]\([^)]+\)/', '$1', $t)       ?? $t;  // links
+        $t = preg_replace('/```[\s\S]*?```/', '', $t)                 ?? $t;  // code blocks
+        $t = preg_replace('/`[^`]+`/', '', $t)                        ?? $t;  // inline code
+        $t = preg_replace('/^#{1,6}\s*/m', '', $t)                   ?? $t;  // headings
+        $t = preg_replace('/[*_~]{1,3}([^*_~]+)[*_~]{1,3}/', '$1', $t) ?? $t; // bold/italic
+        $t = preg_replace('/^\s*[-*+|>]\s*/m', '', $t)               ?? $t;  // lists, quotes, tables
+        $t = preg_replace('/\n{2,}/', ' ', $t)                       ?? $t;  // paragraph breaks
+        return trim($t);
+    }
+}