首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >限制tesseract OCR识别的字符数

限制tesseract OCR识别的字符数
EN

Stack Overflow用户
提问于 2021-03-18 13:19:39
回答 1查看 146关注 0票数 0

我要遮罩的图像大致如下所示。我把 tesseract 当作库来使用,遍历多张图像,识别其中的数字,然后将这些数字遮罩。每张图像中有 12 位数字,我希望只遮罩前 8 位,也就是在第 8 位数字之后停止识别/遮罩。我曾尝试用硬编码坐标来实现,因为大多数图像都比较相似,但稍有偏移的图像就会遮罩错位。有没有办法让 tesseract 在第 8 位数字之后不再遮罩这些图像?我使用 cv2.rectangle 来遮罩图像。

代码语言:python
复制
# Location of the Tesseract executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = "D:/Tess/tesseract.exe"

# How many leading digits of the recognised number are blacked out.
digits_to_mask = 8

# For every JPEG in the data folder: crop the picture to its content, OCR the
# lower-left strip, black out the first `digits_to_mask` recognised digits with
# a single rectangle and store the result as a PDF.
for imgfilepathactual in glob.iglob('D:/dataset/allpdf/data/*.jpeg'):
    # Use os.path instead of positional split() indexing so the stem does not
    # depend on which separator glob returns or on dots inside the file name.
    imgfilepath = os.path.splitext(os.path.basename(imgfilepathactual))[0]
    print(imgfilepath)

    img = cv2.imread(imgfilepathactual, cv2.IMREAD_GRAYSCALE)  # if using with pdf conv
    if img is None:
        # cv2.imread returns None instead of raising when the file is unreadable.
        print("could not read " + imgfilepathactual)
        continue

    # Horizontal gradient magnitude -> blur -> Canny gives the edge pixels
    # whose bounding box is used as the crop area.
    sobelX1 = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=1)
    sobelX1 = np.uint8(np.absolute(sobelX1))
    blurred = cv2.blur(sobelX1, (3, 3))  # for pdf->img
    canny = cv2.Canny(blurred, 5, 250)

    pts1 = np.argwhere(canny > 0)
    if pts1.size == 0:
        print("no edges found in " + imgfilepathactual)
        continue
    y11, x11 = pts1.min(axis=0)
    y21, x21 = pts1.max(axis=0)

    cropped = img[y11:y21, x11:x21]
    if cropped.size == 0:
        print("empty crop for " + imgfilepathactual)
        continue

    # Normalise every picture to 1080x720 so the relative crop boxes below
    # refer to comparable regions.
    resized_path = 'resizedimage' + imgfilepath + '.jpeg'
    resizedimage = cv2.resize(cropped, (1080, 720), interpolation=cv2.INTER_CUBIC)  # actual
    cv2.imwrite(resized_path, resizedimage)

    img = cv2.imread(resized_path)
    h1, w1, _ = img.shape

    # Drop the bottom 10 % of the picture and overwrite the temporary file.
    resizedimage = Image.open(resized_path)
    resizedimage = resizedimage.crop((0, 0, w1, 0.90 * h1))
    resizedimage.save(resized_path)

    # `img` is the picture the mask is drawn on; (h2, w2) is its size.
    img = cv2.imread(resized_path)
    h2, w2, _ = img.shape
    print((h2, w2), "reso")

    # OCR only the lower-left strip. The strip starts at x = 0 and ends at the
    # bottom edge of `img`, which keeps the coordinate conversion below simple.
    croppedimg2 = Image.open(resized_path)
    croppedimg2 = croppedimg2.crop((0, 0.75 * h2, 0.48 * w2, h2))
    croppedimg2.save('croppedimg2' + imgfilepath + '.jpeg')

    aadharBoxes = pytesseract.image_to_boxes(croppedimg2, lang="eng")

    # Each output line describes one character: "<char> <x1> <y1> <x2> <y2> <page>".
    # Collect the boxes of the first `digits_to_mask` digit characters only.
    digit_boxes = []
    for line in aadharBoxes.splitlines():
        fields = line.split(' ')
        if len(fields) < 5 or not fields[0].isdigit():
            continue
        digit_boxes.append([int(value) for value in fields[1:5]])
        if len(digit_boxes) == digits_to_mask:
            break

    if digit_boxes:
        # Tesseract measures y upwards from the bottom edge while OpenCV
        # measures downwards from the top edge. Because the OCR strip shares
        # its bottom edge with `img`, the conversion is simply h2 - y.
        left = min(box[0] for box in digit_boxes)
        right = max(box[2] for box in digit_boxes)
        top = h2 - max(box[3] for box in digit_boxes)
        bottom = h2 - min(box[1] for box in digit_boxes)

        maskedImage = cv2.rectangle(img, (left, top), (right, bottom), (0, 0, 0), -1)
        cv2.imwrite("maskedImage" + imgfilepath + ".jpeg", maskedImage)
        pdf = img2pdf.convert("maskedImage" + imgfilepath + ".jpeg")
        with open("D:/dataset/allpdf/masked_files/masked" + imgfilepath + ".pdf", "wb") as pdf_file:
            pdf_file.write(pdf)
    else:
        # Nothing to hide was found; do not emit a file that claims to be masked.
        print("no digits recognised in " + imgfilepath + "; nothing masked")

    # Only the intermediate resized picture is removed; the masked and cropped
    # JPEGs are kept on disk.
    os.remove(resized_path)
      


# For every PDF: render it to a JPEG, crop the picture to its content, OCR the
# lower-left region, black out every recognised character and save the result
# as a new PDF. All intermediate files are removed afterwards.
for filepathactual in glob.iglob('D:/dataset/allpdf/*.pdf'):
    # Use os.path instead of positional split() indexing so the stem does not
    # depend on which separator glob returns or on dots inside the file name.
    filepath = os.path.splitext(os.path.basename(filepathactual))[0]
    print(filepath)

    out_path = 'out' + filepath + '.jpg'
    resized_path = 'resizedimage' + filepath + '.jpeg'
    masked_path = 'maskedImage' + filepath + '.jpeg'
    cropped_path = 'croppedimg2' + filepath + '.jpeg'

    def convertPdf2img():
        """Convert the current PDF to images and save them as a JPEG.

        Every page is written to the same file name, so when the PDF has
        several pages only the last one remains on disk.
        """
        pages = convert_from_path(filepathactual, 500)  # converting pdf to img
        for page in pages:
            page.save(out_path, 'JPEG')

    try:
        convertPdf2img()

        img = cv2.imread(out_path, cv2.IMREAD_GRAYSCALE)  # if using with pdf conv
        if img is None:
            # cv2.imread returns None instead of raising when nothing was written.
            print("could not read " + out_path)
            continue

        # Blur -> Canny gives the edge pixels whose bounding box is the crop area.
        blurred = cv2.blur(img, (3, 3))  # for pdf->img
        canny = cv2.Canny(blurred, 5, 250)

        pts1 = np.argwhere(canny > 0)
        if pts1.size == 0:
            print("no edges found in " + filepathactual)
            continue
        y11, x11 = pts1.min(axis=0)
        y21, x21 = pts1.max(axis=0)

        cropped = img[y11:y21, x11:x21]
        if cropped.size == 0:
            print("empty crop for " + filepathactual)
            continue

        # Normalise every picture to 1080x720 so the relative crop boxes below
        # refer to comparable regions.
        resizedimage = cv2.resize(cropped, (1080, 720), interpolation=cv2.INTER_CUBIC)  # actual
        cv2.imwrite(resized_path, resizedimage)

        img = cv2.imread(resized_path)
        h1, w1, _ = img.shape

        # Drop the bottom 10 % of the picture and overwrite the temporary file.
        resizedimage = Image.open(resized_path)
        resizedimage = resizedimage.crop((0, 0, w1, 0.90 * h1))
        resizedimage.save(resized_path)

        # `img` is the picture the masks are drawn on; (h2, w2) is its size.
        img = cv2.imread(resized_path)
        h2, w2, _ = img.shape
        print((h2, w2), "reso")

        # OCR only the lower-left region. It starts at x = 0 and ends at the
        # bottom edge of `img`, so box coordinates map to `img` via h2 - y.
        croppedimg2 = Image.open(resized_path)
        croppedimg2 = croppedimg2.crop((0, 0.65 * h2, 0.6 * w2, h2))
        croppedimg2.save(cropped_path)

        aadharBoxes = pytesseract.image_to_boxes(croppedimg2, lang="eng")

        # Draw one filled black rectangle per recognised character.
        masked_count = 0
        for b in aadharBoxes.splitlines():
            b = b.split(' ')
            if len(b) < 5:
                continue
            cv2.rectangle(img, (int(b[1]), h2 - int(b[2])), (int(b[3]), h2 - int(b[4])), (0, 0, 0), -1)
            masked_count += 1

        if masked_count:
            # Write the outputs once, after all rectangles have been drawn,
            # instead of re-encoding the JPEG and PDF for every character.
            maskedImage = img
            cv2.imwrite(masked_path, maskedImage)
            pdf = img2pdf.convert(masked_path)
            with open("D:/dataset/allpdf/masked_files/masked" + filepath + ".pdf", "wb") as pdf_file:
                pdf_file.write(pdf)
        else:
            print("no characters recognised in " + filepath + "; nothing masked")
    finally:
        # Remove whichever intermediate files were actually created; some of
        # them do not exist when a step above was skipped.
        for temp_path in (out_path, resized_path, masked_path, cropped_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)
EN

回答 1

Stack Overflow用户

发布于 2021-03-18 13:44:00

只选取前八个字符的边界框来绘制矩形,该范围之后的字符不会被处理。

代码语言:python
复制
import pytesseract
import numpy as np
import cv2

# Draw one rectangle around the first eight characters Tesseract recognises
# in the picture and show the result in a window.
img = cv2.imread('muTYX.jpg')

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

text = pytesseract.image_to_boxes(img, lang='eng', config='--psm 7 --oem 3')

# Each non-empty output line describes one character:
# "<char> <x1> <y1> <x2> <y2> <page>". Keep at most the first eight so that
# fewer recognised characters no longer raise an IndexError.
text = [line.split(' ') for line in text.splitlines() if line][:8]

if text:
    # Collect the lower and upper y value of every kept box.
    high = []
    for box in text:
        high.append(int(box[2]))
        high.append(int(box[4]))

    # Tesseract measures y upwards from the bottom edge of the image while
    # OpenCV measures downwards from the top edge, so flip the values.
    img_height = img.shape[0]
    top = img_height - max(high)
    bottom = img_height - min(high)

    # Span from the left edge of the first box to the right edge of the last.
    cv2.rectangle(img, (int(text[0][1]), top), (int(text[-1][3]), bottom), (0, 255, 0), 2)

cv2.imshow('nubmer', img)
cv2.waitKey(0)

票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/66685205

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档