Friday, August 30, 2019

Convert Pdf to images then extract Text from each Image


Tasks
1) Convert online pdf to images
2) Convert local pdf to images
3) Convert images to bytes and Base64
4) Extract text from each image Arabic/English
5) Save images to local HD

Ubuntu Packages

apt-get install python3

apt install linuxbrew-wrapper
brew install poppler
or
sudo add-apt-repository ppa:opencpu/poppler
sudo apt-get update
sudo apt-get install python-poppler
sudo apt-get install -y poppler-utils

pip3 install pdf2image

If you have a problem with pip3, use:
sudo apt-get install python3-setuptools
sudo easy_install3 pip

pip3 install pillow
sudo apt install python3-dev libpython3-dev
sudo apt install python3-mysqldb
sudo apt-get install tesseract-ocr
pip3 install pillow pytesseract

Download Arabic support for OCR

sudo apt-get install tesseract-ocr-eng
sudo apt-get install tesseract-ocr-ara

OR

Search for the tessdata directory and download the ara.traineddata language file into it:
find / -name tessdata
wget https://github.com/tesseract-ocr/tessdata/raw/master/ara.traineddata

Code

from pdf2image import convert_from_path,convert_from_bytes
import requests
def ConvertOnlinePdf2Images(pdfFileURL,outputPath):
    """Download a PDF, then OCR, Base64-print and save every page as a JPEG.

    The PDF is rendered at 500 dpi. Each page image is passed to ``OCR()``,
    then to ``IMG_Base64(IMG_Bytes(...))``, and is finally written to
    ``outputPath + <0-based page index> + '.jpeg'``.

    Args:
        pdfFileURL: URL of the PDF to download.
        outputPath: Prefix of the output file names. It is concatenated
            as-is with the page index, so pass a trailing separator
            (``"/temp/"``) to write into a directory; ``"/temp"`` produces
            ``/temp0.jpeg``, ``/temp1.jpeg``, ...

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    import os

    # '/usr/local/bin/' is where Homebrew puts poppler. When pdftoppm is not
    # there (e.g. an apt install), pass None so pdf2image looks it up on PATH.
    poppler_path = '/usr/local/bin/'
    if not os.path.isfile(os.path.join(poppler_path, 'pdftoppm')):
        poppler_path = None

    response = requests.get(pdfFileURL)
    # Fail early instead of handing an HTML error page to poppler.
    response.raise_for_status()

    # Keyword arguments on purpose: newer pdf2image releases inserted extra
    # parameters in the middle of the signature, so positional passing
    # silently assigns these values to the wrong options.
    pages = convert_from_bytes(
        response.content,
        dpi=500,
        output_folder=None,
        first_page=None,
        last_page=None,
        fmt='ppm',
        thread_count=20,
        userpw=None,
        use_cropbox=False,
        strict=False,
        transparent=False,
        single_file=False,
        output_file='',
        poppler_path=poppler_path,
    )
    for index, page in enumerate(pages):
        OCR(page)
        IMG_Base64(IMG_Bytes(page))
        newfilename = outputPath + str(index) + '.jpeg'
        page.save(newfilename, 'JPEG')


def ConvertPdfFile2Image(PDFfileName):
    """OCR, Base64-print and save every page of a local PDF as a JPEG.

    The PDF is rendered at 500 dpi. Each page image is passed to ``OCR()``,
    then to ``IMG_Base64(IMG_Bytes(...))``, and is finally written next to
    the source file as ``<name without extension><0-based page index>.jpeg``
    (``/a/1.pdf`` -> ``/a/10.jpeg``, ``/a/11.jpeg``, ...).

    Args:
        PDFfileName: Path of the PDF file to convert.
    """
    import os

    # '/usr/local/bin/' is where Homebrew puts poppler. When pdftoppm is not
    # there (e.g. an apt install), pass None so pdf2image looks it up on PATH.
    poppler_path = '/usr/local/bin/'
    if not os.path.isfile(os.path.join(poppler_path, 'pdftoppm')):
        poppler_path = None

    # Keyword arguments on purpose: newer pdf2image releases inserted extra
    # parameters in the middle of the signature, so positional passing
    # silently assigns these values to the wrong options.
    pages = convert_from_path(
        PDFfileName,
        dpi=500,
        output_folder=None,
        first_page=None,
        last_page=None,
        fmt='ppm',
        thread_count=1,
        userpw=None,
        use_cropbox=False,
        strict=False,
        transparent=False,
        single_file=False,
        output_file='',
        poppler_path=poppler_path,
    )

    # splitext() drops the real extension whatever its length; slicing off
    # four characters mangles names that do not end in a 3-letter extension.
    base_name = os.path.splitext(PDFfileName)[0]
    for index, page in enumerate(pages):
        OCR(page)
        IMG_Base64(IMG_Bytes(page))
        newfilename = base_name + str(index) + '.jpeg'
        page.save(newfilename, 'JPEG')



from PIL import Image
import pytesseract
def OCR(img, lang='ara+eng'):
    """Run Tesseract OCR on an image, print the text and return it.

    Args:
        img: Image object accepted by ``pytesseract.image_to_string``
            (the callers in this file pass pdf2image page images).
        lang: Tesseract language specification; defaults to Arabic plus
            English.

    Returns:
        The recognised text as returned by ``pytesseract.image_to_string``.
    """
    import os

    # Only override pytesseract's command when the Homebrew-style binary is
    # really there; otherwise keep pytesseract's own setting so installs in
    # other locations (e.g. apt's /usr/bin) keep working.
    tesseract_path = '/usr/local/bin/tesseract'
    if os.path.isfile(tesseract_path):
        pytesseract.pytesseract.tesseract_cmd = tesseract_path

    text = pytesseract.image_to_string(img, lang=lang)
    print(text)
    return text
 

import io
def IMG_Bytes(pil_im):
    """Serialise an image to JPEG and return the encoded bytes.

    Args:
        pil_im: Object exposing ``save(file_object, format)``; the callers
            in this file pass page images.

    Returns:
        The bytes written by ``pil_im.save`` into an in-memory buffer.
    """
    with io.BytesIO() as buffer:
        pil_im.save(buffer, 'jpeg')
        return buffer.getvalue()
 
import base64
def IMG_Base64(img):
    """Base64-encode raw image bytes, print the result and return it.

    Args:
        img: Bytes-like object to encode (e.g. the output of ``IMG_Bytes``).

    Returns:
        The Base64 encoding of *img* as ``bytes``. Returning it lets callers
        use the value, which was previously only printed.
    """
    encoded_string = base64.b64encode(img)
    print(encoded_string)
    return encoded_string
 



import os
# Demo 1: download a PDF over HTTP, then OCR / Base64-print / save each page.
url = r"http://localhost:8888/sg248226.pdf"
ConvertOnlinePdf2Images(pdfFileURL=url, outputPath="/temp")
# Demo 2: the same pipeline for a PDF stored on the local disk.
ConvertPdfFile2Image(PDFfileName="/Users/rafie/1.pdf")

No comments: