Files
folio/src/SearchLogParser.php
T
cedricAbonnel dc4701d667 feat : visiteurs uniques, filtre jours, redirect 404→search, edit_tags (v1.6.16)
- SearchLogParser : visiteurs uniques par terme (IPs distinctes) au lieu de hits bruts (#41)
- SearchLogParser : paramètre $days (7/14), cache distinct par période, filtre logFiles par date (#46)
- admin/searches : boutons 7 j / 14 j, label dynamique, colonne « Visiteurs » (#41, #46)
- URL inconnue / slug absent : redirect 302 /search?q=… au lieu de page 404 (#57)
- edit_tags : masquer abbrev/camel si des valeurs connues existent pour le type (#48)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 09:50:56 +02:00

164 lines
5.1 KiB
PHP

<?php
declare(strict_types=1);
/**
 * Extracts the terms requested on `/search?q=…` from Apache access logs and
 * ranks them by number of unique visitors (distinct first-field client
 * addresses), not by raw hit count.
 *
 * Current logs and their rotations (`.N`, `.N.gz`, `.N.tar.gz`) are read, but
 * only files modified within the configured number of days. The aggregated
 * result is cached as JSON for `$cacheTtl` seconds.
 */
class SearchLogParser
{
    private string $logDir;
    private string $vhostBase;
    private string $cacheFile;
    private int $cacheTtl;
    private int $days;

    /**
     * @param string $logDir    Directory containing the access logs (trailing slash is stripped).
     * @param string $vhostBase Glob pattern, relative to $logDir, matching the base log files.
     * @param string $cacheFile Cache file path; when empty, defaults to
     *                          `<parent of this dir>/_cache/search_terms_<days>d.json`
     *                          so each period has its own cache.
     * @param int    $cacheTtl  Cache lifetime in seconds.
     * @param int    $days      Look-back window in days, clamped to the range 1..30.
     */
    public function __construct(
        string $logDir = '/var/log/apache2',
        string $vhostBase = '*-access.log',
        string $cacheFile = '',
        int $cacheTtl = 600,
        int $days = 14
    ) {
        $this->logDir = rtrim($logDir, '/');
        $this->vhostBase = $vhostBase;
        $this->days = max(1, min(30, $days));
        $this->cacheFile = $cacheFile !== ''
            ? $cacheFile
            : dirname(__DIR__) . '/_cache/search_terms_' . $this->days . 'd.json';
        $this->cacheTtl = $cacheTtl;
    }

    /**
     * Returns the most searched terms, served from the cache when it is fresh.
     *
     * @param int $limit Maximum number of terms returned.
     *
     * @return array<string,int> term => unique visitors, sorted descending
     */
    public function topTerms(int $limit = 100): array
    {
        $cached = $this->readCache();
        if ($cached !== null) {
            return array_slice($cached, 0, $limit, true);
        }

        $visitors = []; // term => [ip => true]
        foreach ($this->logFiles() as $file) {
            $this->parseFile($file, $visitors);
        }

        $counts = [];
        foreach ($visitors as $term => $ips) {
            $counts[$term] = count($ips);
        }
        arsort($counts);

        $this->writeCache($counts);

        return array_slice($counts, 0, $limit, true);
    }

    /**
     * Tells whether at least one readable, recent enough log file was found.
     */
    public function isReadable(): bool
    {
        return count($this->logFiles()) > 0;
    }

    /**
     * Tells whether the cache file exists and is younger than the TTL.
     */
    private function cacheValid(): bool
    {
        if (!file_exists($this->cacheFile)) {
            return false;
        }

        $mtime = filemtime($this->cacheFile);

        return $mtime !== false && (time() - $mtime) < $this->cacheTtl;
    }

    /**
     * Loads the cached counts, or returns null when the cache is stale,
     * unreadable, empty or not valid JSON (the caller then re-parses the logs).
     *
     * @return array<string,int>|null
     */
    private function readCache(): ?array
    {
        if (!$this->cacheValid()) {
            return null;
        }

        // The file may vanish or be rewritten between the check and the read.
        $raw = @file_get_contents($this->cacheFile);
        if ($raw === false || $raw === '') {
            return null;
        }

        $data = json_decode($raw, true);

        return is_array($data) ? $data : null;
    }

    /**
     * Persists the counts as JSON, creating the cache directory if needed.
     *
     * @param array<string,int> $counts
     */
    private function writeCache(array $counts): void
    {
        $json = json_encode($counts, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // Never write an empty/invalid cache: it would look "fresh" yet be unusable.
            return;
        }

        $dir = dirname($this->cacheFile);
        if (!is_dir($dir)) {
            @mkdir($dir, 0755, true);
        }

        // LOCK_EX keeps concurrent writers from interleaving their output.
        file_put_contents($this->cacheFile, $json, LOCK_EX);
    }

    /**
     * Lists the log files to parse: each base file matched by the pattern plus
     * its rotations, limited to readable files modified within the window.
     *
     * @return list<array{path:string,type:string}> type: plain|gz|tgz
     */
    private function logFiles(): array
    {
        $pattern = $this->logDir . '/' . $this->vhostBase;
        $cutoff = time() - $this->days * 86400;
        $files = []; // keyed by path so a file is never listed (and parsed) twice

        // Files matching the base pattern (current logs, plus rotations if the glob is broad).
        $bases = glob($pattern) ?: [];

        // Also add the rotations (.N, .N.gz, .N.tar.gz) of every base file found.
        foreach ($bases as $base) {
            // Skip rotations the glob itself matched: they are picked up via their base file.
            if (str_ends_with($base, '.gz') || preg_match('/\.\d+$/', $base)) {
                continue;
            }

            $candidates = array_merge([$base], glob($base . '.*') ?: []);
            foreach ($candidates as $path) {
                if (isset($files[$path]) || !is_readable($path)) {
                    continue;
                }

                $mtime = @filemtime($path);
                if ($mtime === false || $mtime < $cutoff) {
                    continue;
                }

                // Test '.tar.gz' first: such a path also ends with '.gz'.
                if (str_ends_with($path, '.tar.gz')) {
                    $type = 'tgz';
                } elseif (str_ends_with($path, '.gz')) {
                    $type = 'gz';
                } else {
                    $type = 'plain';
                }

                $files[$path] = ['path' => $path, 'type' => $type];
            }
        }

        return array_values($files);
    }

    /**
     * Feeds every line of one log file (plain, gzip or tar.gz) to parseLine().
     * Unreadable files and archives are silently skipped (best effort).
     *
     * @param array{path:string,type:string}   $file
     * @param array<string,array<string,true>> $visitors term => [ip => true], filled in place
     */
    private function parseFile(array $file, array &$visitors): void
    {
        if ($file['type'] === 'tgz') {
            try {
                $phar = new PharData($file['path']);
                foreach ($phar as $entry) {
                    $content = @file_get_contents('phar://' . $file['path'] . '/' . $entry->getFilename());
                    if ($content === false) {
                        continue;
                    }
                    foreach (explode("\n", $content) as $line) {
                        $this->parseLine($line, $visitors);
                    }
                }
            } catch (\Exception $e) {
                // Unreadable archive: ignore it.
            }

            return;
        }

        if ($file['type'] === 'gz') {
            $h = @gzopen($file['path'], 'rb');
            if (!$h) {
                return;
            }
            // Stop on the first failed read (EOF or error) and read whole lines,
            // exactly like the plain-file branch below.
            while (($line = gzgets($h)) !== false) {
                $this->parseLine($line, $visitors);
            }
            gzclose($h);

            return;
        }

        $h = @fopen($file['path'], 'rb');
        if (!$h) {
            return;
        }
        while (($line = fgets($h)) !== false) {
            $this->parseLine($line, $visitors);
        }
        fclose($h);
    }

    /**
     * Records the visitor of one access-log line if it is a `GET /search?…`
     * request carrying a usable `q` parameter.
     *
     * The term is trimmed and lower-cased. Empty terms, terms longer than 200
     * characters, non-string values (`q[]=…`) and invalid UTF-8 are ignored.
     *
     * @param array<string,array<string,true>> $visitors term => [ip => true], filled in place
     */
    private function parseLine(string $line, array &$visitors): void
    {
        // Cheap substring test first: most lines are not search requests.
        if (!str_contains($line, 'GET /search?')) {
            return;
        }
        if (preg_match('/^(\S+) \S+ \S+ \[[^\]]+\] "GET \/search\?([^"]*) HTTP\//', $line, $m) !== 1) {
            return;
        }

        $ip = $m[1];
        parse_str($m[2], $params);

        // parse_str() has already percent-decoded the value; decoding it again
        // would corrupt terms containing '+' or '%'.
        $q = $params['q'] ?? '';
        if (!is_string($q)) {
            return; // e.g. "?q[]=x" yields an array
        }

        $q = trim($q);
        if ($q === '' || !mb_check_encoding($q, 'UTF-8') || mb_strlen($q, 'UTF-8') > 200) {
            return;
        }

        $q = mb_strtolower($q, 'UTF-8');
        $visitors[$q][$ip] = true;
    }
}