$articles Liste brute d'articles (depuis ArticleManager)
     * @return array
     */
    public function search(string $query, array $articles): array
    {
        $tokens = $this->tokenize($query);
        if ($tokens === []) {
            return [];
        }

        $results = [];
        foreach ($articles as $article) {
            // 'plain' is precomputed in search_index.json; otherwise the Markdown is stripped on the fly.
            $plain  = $article['plain'] ?? $this->stripMarkdown($article['content'] ?? '');
            $tWords = $this->tokenize($article['title'] ?? '');
            $cWords = $this->tokenize($article['category'] ?? '');
            $pWords = $this->tokenize($plain);

            $score = $this->scoreArticle($tokens, $tWords, $cWords, $pWords);
            if ($score > 0.0) {
                $results[] = [
                    'article' => $article,
                    'score'   => $score,
                    'snippet' => $this->buildSnippet($plain, $tokens),
                    'tier'    => $this->determineTier($tokens, $tWords, $cWords, $pWords),
                ];
            }
        }

        // Best tier first (1 before 2 before 3); within a tier, highest score first.
        usort($results, static function (array $a, array $b): int {
            if ($a['tier'] !== $b['tier']) {
                return $a['tier'] <=> $b['tier'];
            }

            return $b['score'] <=> $a['score'];
        });

        return $results;
    }

    // ─── Scoring ─────────────────────────────────────────────────────────────

    /**
     * Sums the weighted per-token scores of one article (strict AND semantics).
     *
     * @param string[] $tokens Normalized query tokens
     * @param string[] $tWords Normalized title words
     * @param string[] $cWords Normalized category words
     * @param string[] $pWords Normalized content words
     * @return float Total score, or 0.0 as soon as one token matches nothing
     */
    private function scoreArticle(array $tokens, array $tWords, array $cWords, array $pWords): float
    {
        $total = 0.0;
        foreach ($tokens as $token) {
            $ts = $this->tokenScore($token, $tWords) * self::TITLE_WEIGHT
                + $this->tokenScore($token, $cWords) * self::CAT_WEIGHT
                + $this->tokenScore($token, $pWords) * self::CONTENT_WEIGHT;

            if ($ts <= 0.0) {
                return 0.0; // strict AND: a token that matches nowhere excludes the article
            }
            $total += $ts;
        }

        return $total;
    }

    /**
     * Classifies a result into a tier (fuzzy matching is disabled here):
     *   1 → every token matches a title word exactly or as a substring (score ≥ 0.75)
     *   2 → every token matches, exactly or as a substring, a word of the title,
     *       category or content
     *   3 → at least one token is only reachable through fuzzy matching
     *
     * @param string[] $tokens Normalized query tokens
     * @param string[] $tWords Normalized title words
     * @param string[] $cWords Normalized category words
     * @param string[] $pWords Normalized content words
     */
    private function determineTier(array $tokens, array $tWords, array $cWords, array $pWords): int
    {
        $inTitle = true;
        foreach ($tokens as $token) {
            if ($this->tokenScore($token, $tWords, false) < 0.75) {
                $inTitle = false;
                break;
            }
        }
        if ($inTitle) {
            return 1;
        }

        $allWords = array_merge($tWords, $cWords, $pWords);
        foreach ($tokens as $token) {
            if ($this->tokenScore($token, $allWords, false) < 0.75) {
                return 3;
            }
        }

        return 2;
    }

    /**
     * Returns a 0–1 score measuring how well $token matches the best word of $words.
     *
     *   1.0          exact match
     *   0.75         substring match, either direction (plurals, conjugations)
     *   sim × 0.55   trigram similarity at or above FUZZY_FLOOR (typos)
     *
     * @param string   $token Normalized query token
     * @param string[] $words Normalized candidate words
     * @param bool     $fuzzy Whether trigram matching is allowed
     */
    private function tokenScore(string $token, array $words, bool $fuzzy = true): float
    {
        $best           = 0.0;
        $tLen           = mb_strlen($token);
        $allowSubstring = $tLen >= 3;
        $allowFuzzy     = $fuzzy && $tLen >= 4;

        foreach ($words as $w) {
            if ($w === $token) {
                return 1.0; // exact
            }

            if ($best >= 0.75) {
                // A fuzzy hit is worth at most 0.55, so only an exact match can still improve the score.
                continue;
            }

            // The reverse test (word contained in the token) also requires a word of at least
            // 3 characters; otherwise 2-letter words such as "de" or "en" would match almost
            // any longer token and make every article a hit.
            if (
                $allowSubstring
                && (str_contains($w, $token) || (mb_strlen($w) >= 3 && str_contains($token, $w)))
            ) {
                $best = 0.75; // substring (plurals, conjugations)
                continue;
            }

            if ($allowFuzzy) {
                $sim = $this->trigramSimilarity($token, $w);
                if ($sim >= self::FUZZY_FLOOR) {
                    $best = max($best, $sim * 0.55); // fuzzy (typos)
                }
            }
        }

        return $best;
    }

    /**
     * Computes a cumulative (OR) score for several tokens over a set of articles.
     * Each article is tokenized only once, which avoids N tokenizations for N calls to search().
     * Fuzzy (trigram) matching is disabled on the content for performance reasons.
     *
     * @param string[] $tokens   Normalized words (lowercase, no accents)
     * @param array[]  $articles Articles (must have uuid, title, category, plain|content)
     * @return array{0: array, 1: array} [score by uuid, article by uuid], limited to articles scoring above 0
     */
    public function scorePool(array $tokens, array $articles): array
    {
        if (empty($tokens) || empty($articles)) {
            return [[], []];
        }

        $scoreMap   = [];
        $articleMap = [];

        foreach ($articles as $article) {
            $plain  = $article['plain'] ?? $this->stripMarkdown($article['content'] ?? '');
            $tWords = $this->tokenize($article['title'] ?? '');
            $cWords = $this->tokenize($article['category'] ?? '');
            $pWords = $this->tokenize($plain);

            $total = 0.0;
            foreach ($tokens as $token) {
                $total += $this->tokenScore($token, $tWords, true) * self::TITLE_WEIGHT
                    + $this->tokenScore($token, $cWords, true) * self::CAT_WEIGHT
                    + $this->tokenScore($token, $pWords, false) * self::CONTENT_WEIGHT;
            }

            if ($total > 0.0) {
                $uuid              = $article['uuid'];
                $scoreMap[$uuid]   = $total;
                $articleMap[$uuid] = $article;
            }
        }

        return [$scoreMap, $articleMap];
    }

    // ─── Trigrams ────────────────────────────────────────────────────────────

    /**
     * Similarity of two strings: number of shared distinct trigrams divided by the
     * size of the larger trigram set. Returns 0.0 when either string has no trigram.
     */
    private function trigramSimilarity(string $a, string $b): float
    {
        $tA = $this->trigrams($a);
        $tB = $this->trigrams($b);
        if (empty($tA) || empty($tB)) {
            return 0.0;
        }

        $common = count(array_intersect($tA, $tB));

        return $common / max(count($tA), count($tB));
    }

    /**
     * Distinct 3-character windows of $s (empty when $s is shorter than 3 characters).
     *
     * @return string[]
     */
    private function trigrams(string $s): array
    {
        $out = [];
        $len = mb_strlen($s);
        for ($i = 0; $i + 2 < $len; $i++) {
            $out[] = mb_substr($s, $i, 3);
        }

        return array_unique($out);
    }

    // ─── Snippet with highlighting ───────────────────────────────────────────

    /**
     * Builds an HTML-escaped excerpt of $text around the first token found,
     * with every token occurrence wrapped in <mark>…</mark>.
     *
     * NOTE(review): the result is already escaped and contains markup, so it is
     * presumably meant to be output without further escaping — confirm in the template.
     *
     * @param string   $text   Plain text of the article
     * @param string[] $tokens Normalized query tokens
     */
    private function buildSnippet(string $text, array $tokens): string
    {
        // The offset is searched in the normalized text but applied to the original one.
        // Ligature expansion (æ→ae, œ→oe) can shift it by a few characters, which is
        // harmless for a context window.
        $norm = $this->normalize($text);
        $pos  = 0;
        foreach ($tokens as $token) {
            $p = mb_strpos($norm, $token);
            if ($p !== false) {
                $pos = max(0, $p - 60);
                break;
            }
        }

        $raw = mb_substr($text, $pos, self::SNIPPET_LEN);
        if ($pos > 0) {
            $raw = '…' . ltrim($raw);
        }
        if ($pos + self::SNIPPET_LEN < mb_strlen($text)) {
            $raw .= '…';
        }

        $escape = static fn (string $s): string => htmlspecialchars($s, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

        // Split the RAW text on the tokens and escape each piece separately. Matching after
        // escaping would let tokens such as "amp" or "quot" hit the inside of HTML entities,
        // and per-token passes could match inside markup inserted by a previous pass.
        $pattern = $this->buildHighlightPattern($tokens);
        $parts   = $pattern === null
            ? false
            : preg_split($pattern, $raw, -1, PREG_SPLIT_DELIM_CAPTURE);

        if ($parts === false) {
            // No usable token or PCRE failure (e.g. invalid UTF-8): keep the snippet, unhighlighted.
            return $escape($raw);
        }

        // With a single capture group, odd indexes hold the matches and even indexes the text between them.
        $html = '';
        foreach ($parts as $i => $part) {
            $html .= $i % 2 === 1
                ? '<mark>' . $escape($part) . '</mark>'
                : $escape($part);
        }

        return $html;
    }

    /**
     * Builds one case-insensitive, accent-tolerant regex matching any of $tokens,
     * wrapped in a single capture group. Returns null when there is nothing to match.
     *
     * @param string[] $tokens Normalized query tokens
     */
    private function buildHighlightPattern(array $tokens): ?string
    {
        // Mirrors the folding done by normalize(): a folded letter must also match its accented forms.
        $variants = [
            'a' => '[aàâä]',
            'e' => '[eéèêë]',
            'i' => '[iîï]',
            'o' => '[oôö]',
            'u' => '[uùûü]',
            'c' => '[cç]',
            'n' => '[nñ]',
        ];

        $tokens = array_values(array_unique(array_filter(
            array_map(static fn ($t): string => (string) $t, $tokens),
            static fn (string $t): bool => $t !== ''
        )));
        if ($tokens === []) {
            return null;
        }

        // Longest first, so that "articles" wins over "article" at the same position.
        usort($tokens, static fn (string $a, string $b): int => mb_strlen($b) <=> mb_strlen($a));

        $alternatives = [];
        foreach ($tokens as $token) {
            $alt = '';
            foreach (mb_str_split($token) as $char) {
                $alt .= $variants[$char] ?? preg_quote($char, '/');
            }

            // normalize() expands the ligatures æ/œ into "ae"/"oe"; accept them as well.
            $alternatives[] = str_replace(
                [$variants['a'] . $variants['e'], $variants['o'] . $variants['e']],
                [
                    '(?:' . $variants['a'] . $variants['e'] . '|æ)',
                    '(?:' . $variants['o'] . $variants['e'] . '|œ)',
                ],
                $alt
            );
        }

        return '/(' . implode('|', $alternatives) . ')/iu';
    }

    // ─── Text helpers ────────────────────────────────────────────────────────

    /**
     * Splits a text into normalized words (at least 2 characters each).
     *
     * @return string[]
     */
    private function tokenize(string $text): array
    {
        $words = preg_split('/\W+/u', $this->normalize($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return array_values(array_filter($words, static fn (string $w): bool => mb_strlen($w) >= 2));
    }

    /** Lowercases the text and transliterates French accented characters. */
    private function normalize(string $text): string
    {
        $text = mb_strtolower($text, 'UTF-8');

        return strtr($text, [
            'à' => 'a', 'â' => 'a', 'ä' => 'a',
            'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
            'î' => 'i', 'ï' => 'i',
            'ô' => 'o', 'ö' => 'o',
            'ù' => 'u', 'û' => 'u', 'ü' => 'u',
            'ç' => 'c',
            'æ' => 'ae', 'œ' => 'oe',
            'ñ' => 'n',
        ]);
    }

    /** Removes Markdown syntax to extract the plain text. */
    private function stripMarkdown(string $md): string
    {
        $t = preg_replace('/!\[[^\]]*\]\([^)]+\)/', '', $md) ?? $md;               // images
        $t = preg_replace('/\[([^\]]+)\]\([^)]+\)/', '$1', $t) ?? $t;              // links (keep the label)
        $t = preg_replace('/```[\s\S]*?```/', '', $t) ?? $t;                       // code blocks
        $t = preg_replace('/`[^`]+`/', '', $t) ?? $t;                              // inline code
        $t = preg_replace('/^#{1,6}\s*/m', '', $t) ?? $t;                          // headings
        $t = preg_replace('/[*_~]{1,3}([^*_~]+)[*_~]{1,3}/', '$1', $t) ?? $t;      // bold/italic/strikethrough
        $t = preg_replace('/^\s*[-*+|>]\s*/m', '', $t) ?? $t;                      // lists, quotes, tables
        $t = preg_replace('/\n{2,}/', ' ', $t) ?? $t;                              // collapse blank lines

        return trim($t);
    }
}