scripts-bash/convertPDF

#!/bin/bash

###############################################################################
#
# Script to recursively search a directory and batch convert all files of a given file type (PDF) into
# new PDF and text files, using pdftocairo, pdftotext, tesseract, convert and gs.
#
# To run in your environment set the variables:
#
#   source_dir - Starting directory for recursive search
#
#   input_file_type - Input file type to search for
#
#   output_file_type - Output file type to convert into
#
#
# Change log: 2012-01-08: Initial release.  Tested on Mac OS X Lion.
#
###############################################################################
clear

# Log file, written relative to the directory the script is started from.
fichierLog=convertPDF.log

# Two blank lines on the terminal, then the start-up banner. tee is used
# without -a here, so any log left by a previous run is overwritten.
printf '\n\n'
printf '%s\n\n' "==> Préparation de l'environnement..." | tee "$fichierLog"

# Run timestamp (YYYYMMDDhhmmss); not referenced again in the visible script.
date_en_cours=$(date +"%Y%m%d%H%M%S")

# Conversion settings: the search starts in the current directory, looks for
# *.pdf files and derives output names ending in "new.pdf".
source_dir=$(pwd)
input_file_type=pdf
output_file_type="new.pdf"

# Indexed array that will receive the paths of the files to process.
declare -a mesFichiers


# Build the array of file names to process.
i=0

# The candidate list is first written to fichiersATraiter.lst in the current
# directory, then read back one path per line.
find "$source_dir" -type f -iname "*.$input_file_type" > fichiersATraiter.lst

# IFS= preserves leading/trailing whitespace and -r preserves backslashes, so
# every path is stored exactly as find printed it.
while IFS= read -r aLine; do
    printf '%s\n' "[NFO] $aLine" >> "$fichierLog"
    # Ignore empty or one-character lines.
    if [ "${#aLine}" -ge 2 ]; then
        printf '%s\n' " a traiter." >> "$fichierLog"
        mesFichiers[i]=$aLine
        i=$((i + 1))
    fi
done < fichiersATraiter.lst


# Print a printf-style message on the terminal and append it to the log file.
#   $1   - printf format string (always a literal supplied by this script)
#   $2.. - values substituted into the format
log_msg() {
    local fmt=$1
    shift
    # shellcheck disable=SC2059  # the format is deliberately passed in by the caller
    printf -- "$fmt" "$@" | tee -a "$fichierLog"
}

# Run a command, sending both its stdout and its stderr to the terminal and to
# the log file. Returns the exit status of the command itself, not that of tee.
run_logged() {
    "$@" 2>&1 | tee -a "$fichierLog"
    return "${PIPESTATUS[0]}"
}

# Convert one PDF with both methods.
#   $1 - path of the PDF to convert
# Globals read: output_file_type, fichierLog
# Files written next to the input (<base> is the input path without extension):
#   <base>.<output_file_type>           pages rendered to JPEG then reassembled
#   <base>.<output_file_type>.ocr.txt   OCR text of the rendered pages
#   <base>.txt                          text extracted by pdftotext
#   <base>.95_<output_file_type>.pdf    Ghostscript /ebook re-encoding
# Returns: 0, or 1 when the scratch directory cannot be created.
convert_pdf_file() {
    local in_file=$1
    # Strip the extension with parameter expansion rather than a case-sensitive
    # sed pattern: find matched the files with -iname, so "X.PDF" must be
    # handled too, and a ".pdf" inside a directory name must not be rewritten.
    local base_name=${in_file%.*}
    local out_file="$base_name.$output_file_type"
    local out_file_txt="$base_name.txt"
    local ocr_txt="$out_file.ocr.txt"
    local work_dir page gs_out_file
    local -a pages

    log_msg '________________________________________________________________________________\n\n'
    log_msg ' [NFO] Input "%s"\n\n' "$in_file"
    log_msg ' [NFO] Output "%s"\n\n' "$out_file"

    # Page images go to a private scratch directory, so that only the JPGs of
    # this PDF are OCRed, assembled and deleted (never unrelated *.jpg files of
    # the working directory), whatever directory the PDF lives in.
    work_dir=$(mktemp -d "${TMPDIR:-/tmp}/convertPDF.XXXXXX") || {
        log_msg ' [ERR] MKTEMP : Impossible de créer un répertoire temporaire pour "%s"\n\n' "$in_file"
        return 1
    }

    log_msg ' [NFO] Conversion (1re méthode) du PDF avec pdftocairo et convert \n\n'

    # Explode the PDF into one JPEG per page: <work_dir>/page-<n>.jpg
    log_msg ' [NFO] PDFTOCAIRO : Création des fichiers JPG "%s"\n\n' "$in_file"
    run_logged pdftocairo -jpeg "$in_file" "$work_dir/page"

    log_msg ' [NFO] PDFTOTEXT : Extraction du texte dans "%s"\n\n' "$out_file_txt"
    run_logged pdftotext "$in_file" "$out_file_txt"

    # OCR every page image and gather the text in a single file.
    log_msg ' [NFO] TESSERACT : Détection par OCR des fichiers JPG \n\n'
    pages=("$work_dir"/page*.jpg)
    if [ -e "${pages[0]}" ]; then
        # Start from an empty file so that a rerun does not duplicate the text.
        : > "$ocr_txt"
        for page in "${pages[@]}"; do
            log_msg '\n  [NFO] TESSERACT : Fichier image "%s"\n\n' "$page"
            run_logged tesseract -l fra "$page" "$page.ocr"
            if [ -f "$page.ocr.txt" ]; then
                cat "$page.ocr.txt" >> "$ocr_txt"
            fi
        done

        # Concatenate the page images back into a PDF.
        log_msg ' [NFO] CONVERT : Concatenation des JPG en PDF dans "%s" \n\n' "$out_file"
        run_logged convert -verbose "${pages[@]}" "$out_file"
    else
        # The glob stayed literal: pdftocairo produced no page image.
        log_msg ' [ERR] PDFTOCAIRO : Aucun fichier JPG produit pour "%s"\n\n' "$in_file"
    fi

    log_msg ' [NFO] RM : Suppresison des JPG \n\n'
    run_logged rm -rv -- "$work_dir"

    # Second method: Ghostscript re-encoding. The on-disk name keeps the
    # historical "<base>.95_<output_file_type>.pdf" form.
    out_file="$base_name.95_$output_file_type"
    gs_out_file="$out_file.pdf"

    log_msg ' [NFO] GS : Conversion (2eme méthode) du PDF \n\n'
    run_logged gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook \
        -sOutputFile="$gs_out_file" "$in_file"

    # Report the file that Ghostscript was actually asked to write.
    log_msg '==> Finished %s \n\n\n' "$gs_out_file"
    return 0
}

printf '\n\n'
log_msg '==> Traitement des PDF en cours...\n\n'

# Walk through the array of files collected earlier.
nb_element=${#mesFichiers[@]}
log_msg '[NFO] Nombre de fichiers : %s\n\n' "$nb_element"

for in_file in "${mesFichiers[@]}"; do
    log_msg ' [NFO] Nom du fichier : "%s"\n\n' "$in_file"

    # Ignore empty or one-character entries.
    if [ "${#in_file}" -ge 2 ]; then
        convert_pdf_file "$in_file"
    fi
done

# Closing banner: two blank lines then the message, shown on the terminal and
# appended to the log.
printf '\n\n%s\n' "==== DONE CONVERTING FILES ====" | tee -a "$fichierLog"


# The script always exits with status 0.
exit 0
Upload New File 2022-03-06 07:46:56 +01:00			`#!/bin/bash`

			`###############################################################################`
			`#`
			`# Script to recursively search a directory and batch convert all files of a given file type into another file`
			`# type via HandBrake conversion.`
			`#`
			`# To run in your environment set the variables: hbcli - Path to your HandBrakeCLI`
			`#`
			`# source_dir - Starting directory for recursive search`
			`#`
			`# input_file_type - Input file type to search for`
			`#`
			`# output_file_type - Output file type to convert into`
			`#`
			`#`
			`# Change log: 2012-01-08: Initial release. Tested on Mac OS X Lion.`
			`#`
			`###############################################################################`
			`clear`

			`fichierLog=convertPDF.log`

			`echo -e "\n"`
			`echo -e "==> Préparation de l'environnement...\n" \| tee $fichierLog`

			`date_en_cours=$(date +"%Y%m%d%H%M%S")`
			`source_dir=$(pwd)`
			`input_file_type=pdf`
			`output_file_type="new.pdf"`
			`declare -a mesFichiers`


			`# Construction d'un tableau des noms de fichiers`
			`i=0`


			`find "$source_dir" -type f -iname "*.$input_file_type" > fichiersATraiter.lst`


			`while read -e aLine`
			`do`
			`echo "[NFO] $aLine">>$fichierLog`
			`if [ ${#aLine} -ge 2 ]; then`
			`echo " a traiter.">>$fichierLog`
			`mesFichiers[$i]=$aLine`
			`i=$(($i+1))`
			`fi`
			`done < fichiersATraiter.lst`


			`echo -e "\n"`
			`echo -e "==> Traitement des PDF en cours...\n" \| tee -a $fichierLog`


			`# Parcours du tableau`
			`nb_element=${#mesFichiers[*]}`
			`echo -e "[NFO] Nombre de fichiers : $nb_element\n" \| tee -a $fichierLog`

			`for in_file in "${mesFichiers[@]}"`
			`do # Liste tous les éléments du tableau.`
			`echo -e " [NFO] Nom du fichier : \"$in_file\"\n" \| tee -a $fichierLog`

			`if [ ${#in_file} -ge 2 ]; then`

			`echo -e "________________________________________________________________________________\n" \| tee -a $fichierLog`
			`echo -e " [NFO] Input \"$in_file\"\n" \| tee -a $fichierLog`

			`# Replace the file type`
			`out_file=$(echo "$in_file"\|sed "s/\(.*\.\)$input_file_type/\1$output_file_type/g")`
			`out_file_txt=$(echo "$in_file"\|sed "s/\(.*\.\)$input_file_type/\1txt/g")`

			`echo -e " [NFO] Output \"$out_file\"\n" \| tee -a $fichierLog`

			`if [ "$in_file" != "$out_file" ]; then`

			`echo -e " [NFO] Conversion (1re méthode) du PDF avec pdftocairo et convert \n" \| tee -a $fichierLog`
			`# explode to jpeg`
			`echo -e " [NFO] PDFTOCAIRO : Création des fichiers JPG \"$in_file\"\n" \| tee -a $fichierLog`
			`pdftocairo "$in_file" -jpeg \| tee -a $fichierLog`

			`echo "" \| echo -e " [NFO] PDFTOTEXT : Extraction du texte dans \"$out_file_txt\"\n" \| tee -a $fichierLog`
			`pdftotext "$in_file" "$out_file_txt" \| tee -a $fichierLog`

			`# OCR`
			`echo -e " [NFO] TESSERACT : Détection par OCR des fichiers JPG \n" \| tee -a $fichierLog`
			`for i in *.jpg`
			`do`
			`echo -e "\n [NFO] TESSERACT : Fichier image \"$i\"\n" \| tee -a $fichierLog`
			`tesseract -l fra "$i" "$i.ocr" \| tee -a $fichierLog`
			`cat "$i.ocr.txt" >> "$out_file.ocr.txt"`
			`rm "$i.ocr.txt"`
			`done`

			`# contact jpeg to pdf`
			`echo -e " [NFO] CONVERT : Concatenation des JPG en PDF dans \"$out_file\" \n" \| tee -a $fichierLog`
			`convert -verbose *.jpg "$out_file" \| tee -a $fichierLog`

			`echo -e " [NFO] RM : Suppresison des JPG \n" \| tee -a $fichierLog`
			`rm -v *.jpg \| tee -a $fichierLog`

			`out_file=$(echo "$in_file"\|sed "s/\(.*\.\)$input_file_type/\195_$output_file_type/g")`
			`out_file_txt=$(echo "$in_file"\|sed "s/\(.*\.\)$input_file_type/\1txt/g")`

			`# echo "" \| pdftocairo "$in_file" -jpeg -jpegopt quality=95 convert .jpg "$out_file" rm .jpg`

			`echo -e " [NFO] GS : Conversion (2eme méthode) du PDF \n" \| tee -a $fichierLog`
			`gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -sOutputFile="$out_file.pdf" "$in_file"`

			`echo -e "==> Finished "$out_file "\n\n" \| tee -a $fichierLog`

			`fi`
			`fi`

			`done`

			`echo -e "\n\n==== DONE CONVERTING FILES ====" \| tee -a $fichierLog`




			`exit 0`