python_for_image_processing_APEER
python_for_image_processing_APEER copied to clipboard
An HOCR output
Sir, inspired by your tutorial and other references, I tried generating HOCR output, but three things are holding me back from the final output.
- Removal of black scattered dust between text
- Extra thickness of the font; it needs to be thinned
- HOCR output is not being generated from multiple TIFF images; I was only able to generate it one page at a time. What would you suggest for this?
`# Python program to extract text from all the images in a folder
# storing the text in corresponding files in a different folder
# This is for hOCR output, but there is an error: only 1 page is produced
import os

from PIL import Image
import pytesseract as pt

# Point pytesseract at the local Tesseract executable (Windows install path).
pt.pytesseract.tesseract_cmd = r'C:\Users\admin\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
def main(path=r"D:\input", tempPath=r"D:\output"):
    """Run Tesseract on every file in *path* and write one hOCR file per input.

    Args:
        path: Folder containing the raw input images.
        tempPath: Folder that receives the ``time_<image name>.hocr`` files.
            It is created if it does not exist yet.

    Each input is handed to Tesseract by file path rather than as an opened
    PIL image. An opened PIL image exposes only its current (first) frame,
    so a multi-page TIFF would be reduced to a single page; given the path,
    Tesseract reads the file itself.
    NOTE(review): Tesseract is expected to emit one ``ocr_page`` element per
    TIFF page in the resulting hOCR -- confirm on a sample multi-page file.
    """
    os.makedirs(tempPath, exist_ok=True)

    # iterating the images inside the folder
    for imageName in os.listdir(path):
        inputPath = os.path.join(path, imageName)
        # Sub-folders cannot be OCR'd; skip them instead of crashing.
        if not os.path.isfile(inputPath):
            continue

        # applying ocr using pytesseract; the result is the hOCR document
        # as bytes
        text = pt.image_to_pdf_or_hocr(
            inputPath, extension='hocr', config=r'--oem 3 --psm 6', lang="eng"
        )

        fullTempPath = os.path.join(tempPath, 'time_' + imageName + ".hocr")
        print(text)

        # saving the hOCR for every image in a separate .hocr file; the
        # context manager closes the file even if the write fails
        with open(fullTempPath, "wb") as file1:
            file1.write(text)
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
`