ebef8c225e
Le foreach artIp30 utilisait $ips comme variable de boucle, écrasant le tableau de comptage des requêtes par IP. Résultat : ips=['66.249…':true] au lieu des vrais top 200 IPs. Renommé en $_artIpSet. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
384 lines
13 KiB
PHP
384 lines
13 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
/**
 * Aggregates Apache "combined" access logs into traffic statistics.
 *
 * Log files matching a glob pattern (plus their rotated siblings: plain,
 * `.gz` and `.tar.gz`) are parsed line by line. The result is memoised per
 * process (per configuration) and persisted as JSON in a cache file that is
 * reused while it is younger than the configured TTL.
 */
class AccessLogParser
{
    private string $logDir;
    private string $pattern;
    private string $cacheFile;
    private int $cacheTtl;
    private int $days;

    /** @var list<string> Lower-cased substrings that mark a user-agent as a bot. */
    private array $botPatterns;

    /** @var array<string,array<string,true>> Article path => set of IPs seen at offset >= days - 7. */
    private array $artIp7 = [];

    /** @var array<string,array<string,true>> Article path => set of IPs seen at offset >= days - 14. */
    private array $artIp14 = [];

    /** @var array<string,array<string,true>> Article path => set of IPs seen anywhere in the window. */
    private array $artIp30 = [];

    /**
     * Per-process memo, keyed by a fingerprint of the parser configuration so
     * that two differently configured instances never share a result.
     *
     * @var array<string,array>
     */
    private static array $memo = [];

    // Apache COMBINED: IP - - [timestamp] "METHOD /path HTTP/x" STATUS bytes "ref" "ua"
    // Captures: 1 = client IP, 2 = timestamp, 3 = path without query string, 4 = status, 5 = user-agent.
    private const RE = '/^(\S+) \S+ \S+ \[(\d{2}\/\w+\/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\] "[A-Z-]+ ([^\s"?]+)[^"]*" (\d{3}) \S+ "[^"]*" "([^"]*)"/u';

    /**
     * @param string       $logDir      Directory holding the access logs (trailing slash is stripped).
     * @param string       $pattern     Glob pattern, relative to $logDir, selecting the base log files.
     * @param string       $cacheFile   JSON cache path; empty string selects `<parent of this dir>/_cache/access_stats.json`.
     * @param int          $cacheTtl    Maximum age of the cache file, in seconds.
     * @param int          $days        Size of the analysis window, in days.
     * @param list<string> $botPatterns Case-insensitive substrings identifying bot user-agents.
     */
    public function __construct(
        string $logDir = '/var/log/apache2',
        string $pattern = '*-access.log',
        string $cacheFile = '',
        int $cacheTtl = 600,
        int $days = 30,
        array $botPatterns = []
    ) {
        $this->logDir = rtrim($logDir, '/');
        $this->pattern = $pattern;
        $this->cacheFile = $cacheFile !== '' ? $cacheFile : dirname(__DIR__) . '/_cache/access_stats.json';
        $this->cacheTtl = $cacheTtl;
        $this->days = $days;
        $this->botPatterns = array_map('strtolower', $botPatterns);
    }

    /**
     * Returns the aggregated statistics, from (in order) the in-process memo,
     * the JSON cache file, or a fresh parse of the log files.
     *
     * @return array{
     *   pages:array<string,int>,
     *   books:array<string,int>,
     *   ips:array<string,int>,
     *   pages_by_day:array<string,list<int>>,
     *   ips_by_day:array<string,list<int>>,
     *   ip_top_paths:array<string,array<string,array{n:int,ts:int}>>,
     *   ip_agents:array<string,list<string>>,
     *   all_uas:array<string,int>,
     *   unique_visitors:array<int,int>,
     *   article_unique_visitors:array<string,array<int,int>>
     * }
     */
    public function stats(): array
    {
        $memoKey = $this->memoKey();
        if (isset(self::$memo[$memoKey])) {
            return self::$memo[$memoKey];
        }

        if ($this->cacheValid()) {
            $cached = json_decode((string) file_get_contents($this->cacheFile), true);
            if (is_array($cached)) {
                return self::$memo[$memoKey] = $cached;
            }
        }

        // The per-article IP sets are instance state: start every parse from a clean slate.
        $this->artIp7 = [];
        $this->artIp14 = [];
        $this->artIp30 = [];

        $cutoff = strtotime("-{$this->days} days midnight") ?: (time() - $this->days * 86400);
        $pages = [];
        $books = [];
        $ips = [];        // public non-bot requests (all paths, all statuses)
        $dayPages = [];
        $ipPaths = [];    // /post/ and /book/ paths with status 200
        $ipPathTs = [];   // latest timestamp per IP and content path
        $ipAllPaths = []; // all paths, all statuses, non-bots
        $ipAllDays = [];  // all days, all statuses, non-bots
        $ipAgents = [];   // non-bot user-agents per IP
        $allUas = [];     // every public user-agent (bots included), for the "detected agents" view

        foreach ($this->logFiles() as $file) {
            $this->parseFile($file, $cutoff, $pages, $books, $ips, $dayPages, $ipPaths, $ipPathTs, $ipAllPaths, $ipAllDays, $ipAgents, $allUas);
        }

        arsort($pages);
        arsort($books);
        arsort($ips);
        arsort($allUas);

        $pagesByDay = [];
        foreach ($dayPages as $path => $byOffset) {
            $pagesByDay[$path] = $this->daySeries($byOffset);
        }

        // Top 200 non-bot IPs by total request volume.
        $ipsByDay = [];
        $ipTopPaths = [];
        $ipTopAgents = [];
        foreach (array_keys(array_slice($ips, 0, 200, true)) as $ip) {
            // Sparkline: total activity per day.
            $ipsByDay[$ip] = $this->daySeries($ipAllDays[$ip] ?? []);

            // Top 20 paths of any kind; "ts" is only known for 200-status content paths, else 0.
            $allPaths = $ipAllPaths[$ip] ?? [];
            arsort($allPaths);
            $ipTopPaths[$ip] = [];
            foreach (array_slice($allPaths, 0, 20, true) as $p => $cnt) {
                $ipTopPaths[$ip][$p] = ['n' => $cnt, 'ts' => $ipPathTs[$ip][$p] ?? 0];
            }

            // Top 5 user-agents.
            $agents = $ipAgents[$ip] ?? [];
            arsort($agents);
            $ipTopAgents[$ip] = array_keys(array_slice($agents, 0, 5, true));
        }

        // Unique visitors per article (public non-bot IPs, /post/ paths with status 200).
        $articleUv = [];
        foreach ($this->artIp30 as $path => $ipSet) {
            $articleUv[$path] = [
                '7' => count($this->artIp7[$path] ?? []),
                '14' => count($this->artIp14[$path] ?? []),
                '30' => count($ipSet),
            ];
        }

        $result = [
            'pages' => $pages,
            'books' => $books,
            'ips' => $ips,
            'pages_by_day' => $pagesByDay,
            'ips_by_day' => $ipsByDay,
            'ip_top_paths' => $ipTopPaths,
            'ip_agents' => $ipTopAgents,
            'all_uas' => array_slice($allUas, 0, 300, true),
            // Computed over ALL non-bot IPs, not just the top 200.
            'unique_visitors' => $this->countUniqueVisitors($ipAllDays),
            'article_unique_visitors' => $articleUv,
        ];

        // Persisting the cache is best-effort; an encoding failure must not
        // leave an empty (but "fresh") cache file behind.
        $json = json_encode($result, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
        if ($json !== false) {
            $this->writeCache($json);
        }

        return self::$memo[$memoKey] = $result;
    }

    /**
     * Tells whether at least one readable log file matches the configuration.
     */
    public function isReadable(): bool
    {
        return count($this->logFiles()) > 0;
    }

    /**
     * Fingerprint of everything that influences the computed statistics.
     */
    private function memoKey(): string
    {
        return md5(serialize([
            $this->logDir,
            $this->pattern,
            $this->cacheFile,
            $this->cacheTtl,
            $this->days,
            $this->botPatterns,
        ]));
    }

    /**
     * Expands a sparse "day offset => count" map into a dense list of
     * `$this->days` counters.
     *
     * NOTE(review): with the midnight-aligned cutoff used by stats(), requests
     * of the current day get offset `$this->days`, which is outside the series
     * and therefore dropped here — confirm this is intended.
     *
     * @param array<int,int> $byOffset
     * @return list<int>
     */
    private function daySeries(array $byOffset): array
    {
        $series = array_fill(0, $this->days, 0);
        foreach ($byOffset as $offset => $count) {
            if ($offset >= 0 && $offset < $this->days) {
                $series[$offset] = $count;
            }
        }

        return $series;
    }

    /**
     * Counts IPs with at least one request in the trailing 7-offset and
     * 14-offset ranges, and anywhere in the window (reported under key 30).
     *
     * @param array<string,array<int,int>> $ipAllDays IP => (day offset => request count)
     * @return array<int,int>
     */
    private function countUniqueVisitors(array $ipAllDays): array
    {
        $unique = [7 => 0, 14 => 0, 30 => 0];
        $start7 = $this->days - 7;
        $start14 = $this->days - 14;

        foreach ($ipAllDays as $byOffset) {
            $active7 = $active14 = $active30 = false;
            foreach ($byOffset as $offset => $cnt) {
                if ($cnt <= 0) {
                    continue;
                }
                $active30 = true;
                $active14 = $active14 || $offset >= $start14;
                $active7 = $active7 || $offset >= $start7;
            }
            $unique[7] += $active7 ? 1 : 0;
            $unique[14] += $active14 ? 1 : 0;
            $unique[30] += $active30 ? 1 : 0;
        }

        return $unique;
    }

    /**
     * Writes the cache file, best-effort: failures are ignored on purpose
     * because the statistics are still returned to the caller.
     *
     * The payload goes to a temporary sibling first and is then renamed over
     * the target, so a concurrent reader does not observe a half-written file.
     */
    private function writeCache(string $json): void
    {
        $dir = dirname($this->cacheFile);
        if (!is_dir($dir)) {
            @mkdir($dir, 0755, true);
        }

        $tmp = $this->cacheFile . '.' . (string) getmypid() . '.tmp';
        if (@file_put_contents($tmp, $json, LOCK_EX) !== false && @rename($tmp, $this->cacheFile)) {
            return;
        }

        // Directory not writable (or rename refused): fall back to a direct write.
        if (file_exists($tmp)) {
            @unlink($tmp);
        }
        @file_put_contents($this->cacheFile, $json, LOCK_EX);
    }

    /**
     * True when the cache file exists and is younger than the TTL.
     */
    private function cacheValid(): bool
    {
        return file_exists($this->cacheFile)
            && (time() - filemtime($this->cacheFile)) < $this->cacheTtl;
    }

    /**
     * Case-insensitive substring match of the user-agent against the bot patterns.
     * An empty user-agent is never considered a bot.
     */
    private function matchesBot(string $ua): bool
    {
        if ($ua === '' || $this->botPatterns === []) {
            return false;
        }

        $lo = strtolower($ua);
        foreach ($this->botPatterns as $p) {
            if ($p !== '' && str_contains($lo, $p)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Lists the readable log files to parse: each base file matching the
     * pattern plus its `<base>.*` siblings. Siblings last modified more than
     * `days + 1` days ago are skipped; the base file is always kept.
     *
     * @return list<array{path:string,type:string}> type is "plain", "gz" or "tgz"
     */
    private function logFiles(): array
    {
        $files = [];
        $cutoff = time() - ($this->days + 1) * 86400;

        foreach (glob($this->logDir . '/' . $this->pattern) ?: [] as $base) {
            // Rotated files are reached through their base file below; never treat them as a base.
            if (str_ends_with($base, '.gz') || preg_match('/\.\d+$/', $base)) {
                continue;
            }
            foreach (array_merge([$base], glob($base . '.*') ?: []) as $path) {
                if ($path !== $base && filemtime($path) < $cutoff) {
                    continue;
                }
                if (!is_readable($path)) {
                    continue;
                }
                if (str_ends_with($path, '.tar.gz')) {
                    $files[] = ['path' => $path, 'type' => 'tgz'];
                } elseif (str_ends_with($path, '.gz')) {
                    $files[] = ['path' => $path, 'type' => 'gz'];
                } else {
                    $files[] = ['path' => $path, 'type' => 'plain'];
                }
            }
        }

        return $files;
    }

    /**
     * Converts an Apache timestamp such as "15/May/2026:00:41:01 +0200" to a
     * Unix timestamp; returns 0 when the input cannot be parsed.
     */
    private static function parseTimestamp(string $raw): int
    {
        if (!preg_match('/(\d{2})\/(\w{3})\/(\d{4}):(\d{2}:\d{2}:\d{2}) ([+-]\d{4})/', $raw, $m)) {
            return 0;
        }

        return (int) strtotime("{$m[1]} {$m[2]} {$m[3]} {$m[4]} {$m[5]}");
    }

    /**
     * Parses one log line and updates the accumulators (all passed by reference).
     * Lines that do not match the combined format, or that are older than
     * $cutoff, are ignored.
     */
    private function parseLine(
        string $line,
        int $cutoff,
        array &$pages,
        array &$books,
        array &$ips,
        array &$dayPages,
        array &$ipPaths,
        array &$ipPathTs,
        array &$ipAllPaths,
        array &$ipAllDays,
        array &$ipAgents,
        array &$allUas
    ): void {
        if (!preg_match(self::RE, $line, $m)) {
            return;
        }
        [, $ip, $ts, $path, $status, $ua] = $m;

        $tsVal = self::parseTimestamp($ts);
        if ($tsVal < $cutoff) {
            return;
        }

        $publicIp = filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE) !== false;
        $dayOffset = (int) floor(($tsVal - $cutoff) / 86400);
        $isBot = $this->matchesBot($ua);

        // Every public user-agent, bots included, for the "detected agents" view.
        if ($publicIp && $ua !== '') {
            $allUas[$ua] = ($allUas[$ua] ?? 0) + 1;
        }

        // Public non-bot requests: visitor, path, day and agent counters.
        if ($publicIp && !$isBot) {
            $ips[$ip] = ($ips[$ip] ?? 0) + 1;
            $ipAllPaths[$ip][$path] = ($ipAllPaths[$ip][$path] ?? 0) + 1;
            $ipAllDays[$ip][$dayOffset] = ($ipAllDays[$ip][$dayOffset] ?? 0) + 1;
            if ($ua !== '') {
                $ipAgents[$ip][$ua] = ($ipAgents[$ip][$ua] ?? 0) + 1;
            }
        }

        // Content-page counters only consider successful, non-bot requests.
        if ($status !== '200' || $isBot) {
            return;
        }

        if (str_starts_with($path, '/post/') && strlen($path) > 6) {
            $pages[$path] = ($pages[$path] ?? 0) + 1;
            $dayPages[$path][$dayOffset] = ($dayPages[$path][$dayOffset] ?? 0) + 1;
            if ($publicIp) {
                $ipPaths[$ip][$path] = ($ipPaths[$ip][$path] ?? 0) + 1;
                if ($tsVal > ($ipPathTs[$ip][$path] ?? 0)) {
                    $ipPathTs[$ip][$path] = $tsVal;
                }
                // Unique visitors per article (public non-bot IPs only).
                $this->artIp30[$path][$ip] = true;
                if ($dayOffset >= $this->days - 14) {
                    $this->artIp14[$path][$ip] = true;
                }
                if ($dayOffset >= $this->days - 7) {
                    $this->artIp7[$path][$ip] = true;
                }
            }
        } elseif (str_ends_with($path, '/') === false && str_starts_with($path, '/book/') && strlen($path) > 6) {
            $books[$path] = ($books[$path] ?? 0) + 1;
            if ($publicIp) {
                $ipPaths[$ip][$path] = ($ipPaths[$ip][$path] ?? 0) + 1;
                if ($tsVal > ($ipPathTs[$ip][$path] ?? 0)) {
                    $ipPathTs[$ip][$path] = $tsVal;
                }
            }
        }
    }

    /**
     * Feeds every line of one log file (plain, gzip or tar.gz) to parseLine().
     * Unreadable or corrupt files are skipped: one bad rotated log must not
     * prevent the statistics from being produced.
     *
     * @param array{path:string,type:string} $file
     */
    private function parseFile(
        array $file,
        int $cutoff,
        array &$pages,
        array &$books,
        array &$ips,
        array &$dayPages,
        array &$ipPaths,
        array &$ipPathTs,
        array &$ipAllPaths,
        array &$ipAllDays,
        array &$ipAgents,
        array &$allUas
    ): void {
        if ($file['type'] === 'tgz') {
            try {
                $phar = new PharData($file['path']);
                foreach ($phar as $entry) {
                    $content = @file_get_contents('phar://' . $file['path'] . '/' . $entry->getFilename());
                    if ($content === false) {
                        continue;
                    }
                    foreach (explode("\n", $content) as $line) {
                        $this->parseLine($line, $cutoff, $pages, $books, $ips, $dayPages, $ipPaths, $ipPathTs, $ipAllPaths, $ipAllDays, $ipAgents, $allUas);
                    }
                }
            } catch (\Exception $e) {
                // Deliberately ignored: an unreadable archive is skipped (best-effort parsing).
            }

            return;
        }

        if ($file['type'] === 'gz') {
            $h = @gzopen($file['path'], 'rb');
            if (!$h) {
                return;
            }
            // Read whole lines (no length cap) so long requests are handled like
            // in plain files, and stop on the first read failure instead of
            // relying on gzeof(), which may never become true on a corrupt stream.
            while (($line = gzgets($h)) !== false) {
                $this->parseLine($line, $cutoff, $pages, $books, $ips, $dayPages, $ipPaths, $ipPathTs, $ipAllPaths, $ipAllDays, $ipAgents, $allUas);
            }
            gzclose($h);

            return;
        }

        $h = @fopen($file['path'], 'rb');
        if (!$h) {
            return;
        }
        while (($line = fgets($h)) !== false) {
            $this->parseLine($line, $cutoff, $pages, $books, $ips, $dayPages, $ipPaths, $ipPathTs, $ipAllPaths, $ipAllDays, $ipAgents, $allUas);
        }
        fclose($h);
    }
}
|