124 lines
3.1 KiB
Bash
124 lines
3.1 KiB
Bash
|
#!/bin/bash
|
||
|
|
||
|
###############################################################################
#
# Script to recursively search a directory and batch convert all files of a
# given file type into another file type.
#
# As written, the script processes PDF files: each one is rendered to JPEG
# pages (pdftocairo), its text is extracted (pdftotext), the pages are OCR'd
# (tesseract), the pages are reassembled into a new PDF (convert), and a
# second, recompressed PDF is produced with Ghostscript (gs).
#
# NOTE: earlier versions of this header described a HandBrake conversion and
# a variable "hbcli - Path to your HandBrakeCLI"; the script below does not
# use HandBrake or that variable.
#
# To run in your environment set the variables:
#   source_dir       - Starting directory for recursive search
#   input_file_type  - Input file type to search for
#   output_file_type - Output file type to convert into
#
# Change log: 2012-01-08: Initial release. Tested on Mac OS X Lion.
#
###############################################################################
|
||
|
|
||
|
#######################################
# Print a message on stdout and append it to the log file, prefixed with the
# run identifier and an ISO-8601 timestamp.
# Globals:   idLog (read), fichierLog (read)
# Arguments: $1 - message text; backslash escapes in it are interpreted
# Outputs:   the message on stdout; one line appended to "$fichierLog"
#######################################
message() {
  local text=${1-}

  # Every expansion is quoted: unquoted, the text was word-split and
  # glob-expanded, so a "*" in a message became a list of file names and
  # leading or repeated spaces were collapsed.
  printf '%b\n' "$text"

  # NOTE: --iso-8601 is a GNU date option; it is kept so the log format
  # stays unchanged.
  printf '%b\n' "$idLog $(date --iso-8601=seconds) $text" >>"$fichierLog"
}
|
||
|
|
||
|
# --- Environment setup -------------------------------------------------------
clear

# Log file that message() appends to.
fichierLog=~/convertPDF.log
# Name of the scratch list file (relative to the current directory).
fichiersATraiter=fichiersATraiter.lst

# Identifier prefixed to every log line of this run: the shell PID followed by
# a UUID. The previous "${uuidgen}" expanded an unset variable instead of
# running the command, so the identifier was always "<pid>-".
if command -v uuidgen >/dev/null 2>&1; then
  idLog="$$-$(uuidgen)"
else
  # uuidgen is not installed: fall back to a pseudo-random suffix.
  idLog="$$-$RANDOM"
fi

message "==> Préparation de l'environnement..."

# Timestamp taken at start-up (YYYYMMDDhhmmss).
date_en_cours=$(date +"%Y%m%d%H%M%S")
# The recursive search starts in the directory the script is launched from.
source_dir=$(pwd)
# Extension searched for, and the suffix that replaces it on output files.
input_file_type=pdf
output_file_type="new.pdf"
# Array receiving the paths of the files to process.
declare -a mesFichiers=()
|
||
|
|
||
|
|
||
|
# Build the array of file names to process.
#
# The find results are streamed NUL-delimited straight into the loop, so no
# scratch list file is written to the working directory and paths containing
# newlines stay intact. "IFS=" and "read -r" keep leading/trailing whitespace
# and backslashes in the paths; the previous plain "read -e" altered both.
mesFichiers=()

while IFS= read -r -d '' aLine; do
  message "[NFO] $aLine"
  # Entries shorter than two characters are not kept.
  if [[ ${#aLine} -ge 2 ]]; then
    message " a traiter."
    mesFichiers+=("$aLine")
  fi
done < <(find "$source_dir" -type f -iname "*.$input_file_type" -print0)
|
||
|
|
||
|
message "==> Traitement des PDF en cours..."

# Walk the array of collected files.
nb_element=${#mesFichiers[@]}
message "[NFO] Nombre de fichiers : $nb_element"

# Scratch directory of the file currently being processed; removed on exit so
# an interrupted run does not leave page images behind.
work_dir=
trap '[[ -n "$work_dir" ]] && rm -rf -- "$work_dir"' EXIT

for in_file in "${mesFichiers[@]}"; do
  message "[NFO] Nom du fichier : \"$in_file\""

  # Entries shorter than two characters are skipped.
  if [[ ${#in_file} -lt 2 ]]; then
    continue
  fi

  message "[NFO] Input \"$in_file\""

  # Replace the file type. The extension is stripped with parameter expansion
  # rather than a case-sensitive sed pattern, so files matched
  # case-insensitively by find (e.g. "X.PDF") are no longer silently skipped.
  base_name=${in_file%.*}
  out_file="$base_name.$output_file_type"
  out_file_txt="$base_name.txt"

  message " [NFO] Output \"$out_file\""

  # Never write over the input file.
  if [[ "$in_file" == "$out_file" ]]; then
    continue
  fi

  # Page images go to a private scratch directory. Previously "*.jpg" was
  # globbed in the working directory, so unrelated JPEG files there were
  # OCR'd, merged into the PDF and then deleted.
  if ! work_dir=$(mktemp -d "${TMPDIR:-/tmp}/convertPDF.XXXXXX"); then
    work_dir=
    message "[ERR] MKTEMP : impossible de créer un répertoire temporaire"
    continue
  fi

  # Explode the PDF into one JPEG per page.
  message "[NFO] PDFTOCAIRO : Création des fichiers JPG \"$in_file\""
  if ! pdftocairo -jpeg "$in_file" "$work_dir/page"; then
    message "[ERR] PDFTOCAIRO : échec pour \"$in_file\""
  fi

  message "[NFO] PDFTOTEXT : Extraction du texte dans \"$out_file_txt\""
  if ! pdftotext "$in_file" "$out_file_txt"; then
    message "[ERR] PDFTOTEXT : échec pour \"$in_file\""
  fi

  # Without nullglob an unmatched pattern stays literal, hence the -e test.
  pages=("$work_dir"/*.jpg)
  if [[ -e "${pages[0]}" ]]; then
    # OCR each page and append its text to one file per document.
    message " [NFO] TESSERACT : Détection par OCR des fichiers JPG"
    for page in "${pages[@]}"; do
      if tesseract -l fra "$page" "$page.ocr"; then
        cat "$page.ocr.txt" >>"$out_file.ocr.txt"
        rm -- "$page.ocr.txt"
      else
        message "[ERR] TESSERACT : échec pour \"$page\""
      fi
    done

    # Concatenate the JPEG pages into a PDF.
    message "[NFO] CONVERT : Concatenation des JPG en PDF dans \"$out_file\""
    if ! convert -verbose "${pages[@]}" "$out_file"; then
      message "[ERR] CONVERT : échec pour \"$out_file\""
    fi
  else
    message "[ERR] PDFTOCAIRO : aucun fichier JPG produit pour \"$in_file\""
  fi

  message "[NFO] RM : Suppresison des JPG"
  rm -rfv -- "${work_dir:?}"
  work_dir=

  # Second method: let Ghostscript rewrite the PDF with the /ebook preset.
  # NOTE(review): the output name ends in ".pdf.pdf" when output_file_type
  # already ends in "pdf"; the on-disk name is kept unchanged on purpose.
  out_file="$base_name.95_$output_file_type"
  gs_out_file="$out_file.pdf"

  # pdftocairo "$in_file" -jpeg -jpegopt quality=95 convert *.jpg "$out_file" rm *.jpg

  message "[NFO] GS : Conversion (2eme méthode) du PDF"
  if gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook \
    -sOutputFile="$gs_out_file" "$in_file"; then
    # Report the file that was actually written; the previous call was
    # mis-quoted and named a path without the final ".pdf".
    message "==> Finished $gs_out_file"
  else
    message "[ERR] GS : échec pour \"$in_file\""
  fi
done

message "==== DONE CONVERTING FILES ===="

exit 0
|