3bb83b3ffd
Balaye tous les fichiers correspondant au pattern (ex: *-access.log) et leurs rotations .gz/.tar.gz. Valeur par défaut : *-access.log. Label renommé en "Pattern des logs d'accès". Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
151 lines
4.7 KiB
PHP
151 lines
4.7 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
/**
 * Aggregates the search terms found in web-server access logs.
 *
 * Every log line containing a `"GET /search?... HTTP/` request is inspected and
 * the value of its `q` query parameter is counted (case-insensitively). Current
 * log files as well as their rotations (plain, `.gz`, `.tar.gz`) are scanned.
 * The aggregated result is cached as JSON for `$cacheTtl` seconds.
 */
class SearchLogParser
{
    /** Directory holding the access logs, stored without a trailing slash. */
    private string $logDir;

    /** Glob pattern (relative to the log directory) selecting the files to scan. */
    private string $vhostBase;

    /** Path of the JSON file used to cache the aggregated counts. */
    private string $cacheFile;

    /** Maximum age of the cache file, in seconds, before it is rebuilt. */
    private int $cacheTtl;

    /**
     * @param string $logDir    Directory containing the access logs.
     * @param string $vhostBase Glob pattern of the log files, e.g. `*-access.log`.
     * @param string $cacheFile Cache file path; an empty string selects
     *                          `<parent of this directory>/_cache/search_terms.json`.
     * @param int    $cacheTtl  Cache lifetime in seconds.
     */
    public function __construct(
        string $logDir = '/var/log/apache2',
        string $vhostBase = '*-access.log',
        string $cacheFile = '',
        int $cacheTtl = 600
    ) {
        $this->logDir = rtrim($logDir, '/');
        $this->vhostBase = $vhostBase;
        $this->cacheFile = $cacheFile !== ''
            ? $cacheFile
            : dirname(__DIR__) . '/_cache/search_terms.json';
        $this->cacheTtl = $cacheTtl;
    }

    /**
     * Returns the most frequent search terms.
     *
     * Served from the JSON cache while it is fresh; otherwise every log file is
     * re-parsed and the cache is rewritten.
     *
     * @param int $limit Maximum number of terms to return.
     *
     * @return array<string,int> term => number of occurrences, sorted descending
     */
    public function topTerms(int $limit = 100): array
    {
        $cached = $this->readCache();
        if ($cached !== null) {
            return array_slice($cached, 0, $limit, true);
        }

        $counts = [];
        foreach ($this->logFiles() as $file) {
            $this->parseFile($file, $counts);
        }
        arsort($counts);

        $this->writeCache($counts);

        return array_slice($counts, 0, $limit, true);
    }

    /**
     * Tells whether at least one readable log file matches the configured pattern.
     */
    public function isReadable(): bool
    {
        return $this->logFiles() !== [];
    }

    /**
     * Tells whether the cache file exists and is younger than the configured TTL.
     */
    private function cacheValid(): bool
    {
        // Drop any stale stat information so a long-running process sees the
        // current modification time.
        clearstatcache(true, $this->cacheFile);

        if (!is_file($this->cacheFile)) {
            return false;
        }

        // The file may disappear between the two calls; treat that as "no cache".
        $mtime = @filemtime($this->cacheFile);

        return $mtime !== false && (time() - $mtime) < $this->cacheTtl;
    }

    /**
     * Loads the cached counts when the cache is fresh and decodable.
     *
     * @return array<string,int>|null the cached counts, or null when a rebuild is needed
     */
    private function readCache(): ?array
    {
        if (!$this->cacheValid()) {
            return null;
        }

        $raw = @file_get_contents($this->cacheFile);
        if ($raw === false) {
            return null;
        }

        $data = json_decode($raw, true);

        return is_array($data) ? $data : null;
    }

    /**
     * Persists the aggregated counts to the cache file.
     *
     * @param array<string,int> $counts
     */
    private function writeCache(array $counts): void
    {
        // Terms come from raw log bytes and may not be valid UTF-8; substitute the
        // offending bytes instead of letting the whole encoding fail.
        $json = json_encode($counts, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE);
        if ($json === false) {
            // Never overwrite the cache with an empty/invalid payload.
            return;
        }

        $dir = dirname($this->cacheFile);
        if (!is_dir($dir)) {
            @mkdir($dir, 0755, true);
        }

        // LOCK_EX prevents two concurrent rebuilds from interleaving their writes.
        file_put_contents($this->cacheFile, $json, LOCK_EX);
    }

    /**
     * Lists every readable log file: the files matching the configured pattern
     * plus, for each of them, its rotations (`<file>.*`).
     *
     * Each path is reported at most once, even when it is reachable both through
     * the pattern and as a rotation of another match, so no file is counted twice.
     *
     * @return list<array{path:string,type:string}> type: plain|gz|tgz
     */
    private function logFiles(): array
    {
        $matches = glob($this->logDir . '/' . $this->vhostBase) ?: [];
        $files = [];
        $seen = [];

        foreach ($matches as $match) {
            $candidates = array_merge([$match], glob($match . '.*') ?: []);
            foreach ($candidates as $path) {
                if (isset($seen[$path])) {
                    continue;
                }
                $seen[$path] = true;

                // Directories can match the glob too; only regular files are logs.
                if (!is_file($path) || !is_readable($path)) {
                    continue;
                }

                $files[] = ['path' => $path, 'type' => $this->detectType($path)];
            }
        }

        return $files;
    }

    /**
     * Maps a file name to the reader needed for it.
     *
     * @return string one of plain|gz|tgz
     */
    private function detectType(string $path): string
    {
        // `.tar.gz` must be tested first because it also ends with `.gz`.
        if (str_ends_with($path, '.tar.gz')) {
            return 'tgz';
        }
        if (str_ends_with($path, '.gz')) {
            return 'gz';
        }

        return 'plain';
    }

    /**
     * Parses one log file and adds its search terms to $counts.
     *
     * Unreadable files are skipped silently: the scan is best effort.
     *
     * @param array{path:string,type:string} $file
     * @param array<string,int>              $counts accumulator, updated in place
     */
    private function parseFile(array $file, array &$counts): void
    {
        if ($file['type'] === 'tgz') {
            $this->parseTarGz($file['path'], $counts);

            return;
        }

        if ($file['type'] === 'gz') {
            $this->parseGz($file['path'], $counts);

            return;
        }

        $this->parsePlain($file['path'], $counts);
    }

    /**
     * Parses every file contained in a `.tar.gz` archive, including nested ones.
     *
     * @param array<string,int> $counts accumulator, updated in place
     */
    private function parseTarGz(string $path, array &$counts): void
    {
        try {
            $archive = new \PharData($path);
            // The recursive iterator yields files only, each with a ready-to-use
            // phar:// pathname, so entries in sub-directories are not missed.
            foreach (new \RecursiveIteratorIterator($archive) as $entry) {
                $handle = @fopen($entry->getPathname(), 'rb');
                if ($handle === false) {
                    continue;
                }
                try {
                    // Stream line by line rather than loading the entry in memory.
                    while (($line = fgets($handle)) !== false) {
                        $this->parseLine($line, $counts);
                    }
                } finally {
                    fclose($handle);
                }
            }
        } catch (\Exception $e) {
            // Unreadable archive: ignore it.
        }
    }

    /**
     * Parses a gzip-compressed log file.
     *
     * @param array<string,int> $counts accumulator, updated in place
     */
    private function parseGz(string $path, array &$counts): void
    {
        $handle = @gzopen($path, 'rb');
        if ($handle === false) {
            return;
        }

        try {
            // Loop on the read result (not on gzeof) so a corrupt stream ends the
            // loop instead of spinning forever, and read whole lines so long
            // requests are not cut in the middle.
            while (($line = gzgets($handle)) !== false) {
                $this->parseLine($line, $counts);
            }
        } finally {
            gzclose($handle);
        }
    }

    /**
     * Parses an uncompressed log file.
     *
     * @param array<string,int> $counts accumulator, updated in place
     */
    private function parsePlain(string $path, array &$counts): void
    {
        $handle = @fopen($path, 'rb');
        if ($handle === false) {
            return;
        }

        try {
            while (($line = fgets($handle)) !== false) {
                $this->parseLine($line, $counts);
            }
        } finally {
            fclose($handle);
        }
    }

    /**
     * Extracts the `q` parameter of a `GET /search?...` request line and counts it.
     *
     * Terms are trimmed and lower-cased; empty terms and terms longer than 200
     * characters are ignored.
     *
     * @param array<string,int> $counts accumulator, updated in place
     */
    private function parseLine(string $line, array &$counts): void
    {
        // Cheap substring test first: most log lines are not search requests.
        if (!str_contains($line, 'GET /search?')) {
            return;
        }
        if (!preg_match('/"GET \/search\?([^"]*) HTTP\//', $line, $m)) {
            return;
        }

        parse_str($m[1], $params);

        // `q[]=...` yields an array; only scalar string terms are meaningful here.
        $raw = $params['q'] ?? '';
        if (!is_string($raw)) {
            return;
        }

        // parse_str() has already URL-decoded the value. Decoding a second time
        // would corrupt terms containing a literal `+` or `%` (e.g. "c++").
        $q = trim($raw);

        if ($q === '' || mb_strlen($q) > 200) {
            return;
        }

        $q = mb_strtolower($q);
        $counts[$q] = ($counts[$q] ?? 0) + 1;
    }
}
|