Tasks
1) Convert online pdf to images
2) Convert local pdf to images
3) Convert images to bytes and Base64
5) Extract text (Arabic/English) from each image
5) Save images to local HD
Ubuntu Packages
apt-get install python3
apt install linuxbrew-wrapper
brew install poppler
or
sudo add-apt-repository ppa:opencpu/poppler
sudo apt-get update
sudo apt-get install python-poppler
sudo apt-get install -y poppler-utils
pip3 install pdf2image
If you have a problem with pip3, use:
sudo apt-get install python3-setuptools
sudo easy_install3 pip
pip3 install pillow
sudo apt install python3-dev libpython3-dev
sudo apt install python3-mysqldb
sudo apt-get install tesseract-ocr
pip3 install pillow pytesseract
Download Arabic support for OCR
sudo apt-get install tesseract-ocr-eng
sudo apt-get install tesseract-ocr-ara
Or search for the tessdata directory and download the ara.traineddata language file into it:
find / -name tessdata
wget https://github.com/tesseract-ocr/tessdata/raw/master/ara.traineddata
Code
from pdf2image import convert_from_path,convert_from_bytes
import requests
def ConvertOnlinePdf2Images(pdfFileURL, outputPath, dpi=500,
                            poppler_path='/usr/local/bin/', timeout=60):
    """Download a PDF and OCR, Base64-print and save every page as a JPEG.

    Args:
        pdfFileURL: URL of the PDF document to download.
        outputPath: Where to write the page images. If it names an existing
            directory the files are created inside it (``<dir>/<n>.jpeg``);
            otherwise it is used as a filename prefix (``<prefix><n>.jpeg``).
        dpi: Rendering resolution handed to pdf2image.
        poppler_path: Directory containing the poppler binaries.
        timeout: Seconds to wait for the HTTP server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(pdfFileURL, timeout=timeout)
    # Fail early on HTTP errors instead of handing an error page to poppler.
    response.raise_for_status()

    # Options are passed by keyword: newer pdf2image releases add parameters
    # (such as ``jpegopt``) in the middle of the signature, so positional
    # passing would shift these values onto the wrong options.
    pages = convert_from_bytes(
        response.content,
        dpi=dpi,
        output_folder=None,
        first_page=None,
        last_page=None,
        fmt='ppm',
        thread_count=20,
        userpw=None,
        use_cropbox=False,
        strict=False,
        transparent=False,
        single_file=False,
        output_file='',
        poppler_path=poppler_path,
    )

    # Plain concatenation with a directory such as "/temp" would produce
    # "/temp0.jpeg" next to the directory rather than a file inside it.
    write_into_directory = os.path.isdir(outputPath)

    for i, page in enumerate(pages):
        OCR(page)
        IMG_Base64(IMG_Bytes(page))
        filename = str(i) + '.jpeg'
        if write_into_directory:
            newfilename = os.path.join(outputPath, filename)
        else:
            newfilename = outputPath + filename
        page.save(newfilename, 'JPEG')
def ConvertPdfFile2Image(PDFfileName, dpi=500, poppler_path='/usr/local/bin/'):
    """OCR, Base64-print and save every page of a local PDF as a JPEG.

    Page ``n`` is written next to the source file as ``<name><n>.jpeg``,
    where ``<name>`` is ``PDFfileName`` without its extension.

    Args:
        PDFfileName: Path of the PDF file on disk.
        dpi: Rendering resolution handed to pdf2image.
        poppler_path: Directory containing the poppler binaries.
    """
    # Options are passed by keyword: newer pdf2image releases add parameters
    # (such as ``jpegopt``) in the middle of the signature, so positional
    # passing would shift these values onto the wrong options.
    pages = convert_from_path(
        PDFfileName,
        dpi=dpi,
        output_folder=None,
        first_page=None,
        last_page=None,
        fmt='ppm',
        thread_count=1,
        userpw=None,
        use_cropbox=False,
        strict=False,
        transparent=False,
        single_file=False,
        output_file='',
        poppler_path=poppler_path,
    )

    # splitext removes whatever extension is present; slicing off the last
    # four characters would mangle names that do not end in a 4-char suffix.
    base_name = os.path.splitext(PDFfileName)[0]

    for i, page in enumerate(pages):
        OCR(page)
        IMG_Base64(IMG_Bytes(page))
        page.save(base_name + str(i) + '.jpeg', 'JPEG')
from PIL import Image
import pytesseract
def OCR(img, lang='ara+eng', tesseract_cmd='/usr/local/bin/tesseract'):
    """Run Tesseract on an image, print the recognised text and return it.

    Args:
        img: Image to recognise, passed straight to
            ``pytesseract.image_to_string``.
        lang: Tesseract language specification; the default requests both
            Arabic and English.
        tesseract_cmd: Path of the tesseract executable pytesseract should
            invoke.

    Returns:
        The text reported by ``pytesseract.image_to_string``.
    """
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
    text = pytesseract.image_to_string(img, lang=lang)
    print(text)
    # Hand the text back as well so callers can use it, not just see it.
    return text
import io
def IMG_Bytes(pil_im):
    """Return the JPEG encoding of ``pil_im`` as a bytes object.

    The image is saved into an in-memory buffer via its ``save`` method and
    the buffer's contents are returned.
    """
    with io.BytesIO() as buffer:
        pil_im.save(buffer, 'jpeg')
        return buffer.getvalue()
import base64
def IMG_Base64(img):
    """Base64-encode raw image bytes, print the result and return it.

    Args:
        img: Bytes-like object holding the encoded image data.

    Returns:
        The Base64 representation as a ``bytes`` object.
    """
    encoded_string = base64.b64encode(img)
    print(encoded_string)
    # Return the value too; printing alone leaves callers with nothing to use.
    return encoded_string
import os
# Example 1: download a PDF over HTTP, then OCR, Base64-print and save each page.
url = r"http://localhost:8888/sg248226.pdf"
ConvertOnlinePdf2Images(pdfFileURL=url, outputPath="/temp")

# Example 2: run the same per-page processing on a PDF stored on the local disk.
ConvertPdfFile2Image(PDFfileName="/Users/rafie/1.pdf")
No comments:
Post a Comment