The images I'm trying to mask look like this. 我把 tesseract 作为库来使用,遍历多张图像,识别其中的数字,然后把这些数字掩蔽掉。每张图像中有 12 位数字,我希望它在第 8 位数字之后停止识别/掩蔽,也就是说,只需要掩蔽前 8 位数字。由于大多数图像都比较相似,我尝试过用硬编码坐标的方式来实现,但位置稍有偏差的图像就会被掩蔽错。有没有办法让 tesseract 在第 8 位数字之后不再掩蔽这些图像?我使用 cv2.rectangle 来掩蔽图像。
def parse_box_lines(box_text):
    """Parse Tesseract box output into ``(char, x1, y1, x2, y2)`` tuples.

    Every well-formed line looks like ``<char> <x1> <y1> <x2> <y2> <page>``.
    Lines with fewer than five fields or non-integer coordinates are skipped
    instead of raising, so blank or garbled lines cannot abort a whole run.
    """
    boxes = []
    for line in box_text.splitlines():
        fields = line.split()
        if len(fields) < 5:
            continue
        try:
            x1, y1, x2, y2 = (int(value) for value in fields[1:5])
        except ValueError:
            continue
        boxes.append((fields[0], x1, y1, x2, y2))
    return boxes


def digits_bounding_box(boxes, roi_bottom, max_digits=8):
    """Return one OpenCV rectangle covering the first *max_digits* digits.

    Args:
        boxes: tuples as produced by ``parse_box_lines``.
        roi_bottom: row index, in the image that will be drawn on, of the
            bottom edge of the region that was handed to Tesseract.
        max_digits: how many leading digit boxes to cover.

    Returns:
        ``((left, top), (right, bottom))`` in top-left-origin pixel
        coordinates, or ``None`` when no digit box is present.
    """
    # Only digit boxes count towards the limit, so stray letters or
    # punctuation recognised before the number do not use up the quota.
    digits = [box for box in boxes if box[0].isdigit()][:max_digits]
    if not digits:
        return None
    left = min(box[1] for box in digits)
    right = max(box[3] for box in digits)
    # Tesseract box coordinates are measured from the bottom-left corner,
    # OpenCV rows from the top, hence the flip.
    top = roi_bottom - max(box[4] for box in digits)
    bottom = roi_bottom - min(box[2] for box in digits)
    return (left, top), (right, bottom)


def mask_jpeg_dataset(
    pattern='D:/dataset/allpdf/data/*.jpeg',
    out_prefix="D:/dataset/allpdf/masked_files/masked",
    max_digits=8,
):
    """Black out the first *max_digits* recognised digits of every JPEG.

    For each file matching *pattern* the image is cropped to its detected
    edges, resized to 1080x720, trimmed to the top 90 %, OCR'd in its
    bottom-left corner, masked, and written as ``<out_prefix><name>.pdf``.
    ``croppedimg2<name>.jpeg`` and ``maskedImage<name>.jpeg`` are left in the
    working directory, as in the original script. Images that cannot be read,
    have no detectable edges, or contain no recognised digits are skipped
    with a message.
    """
    pytesseract.pytesseract.tesseract_cmd = "D:/Tess/tesseract.exe"

    for imgfilepathactual in glob.iglob(pattern):
        # basename/splitext handle both path separators on Windows and
        # file names containing extra dots.
        imgfilepath = os.path.splitext(os.path.basename(imgfilepathactual))[0]
        print(imgfilepath)

        gray = cv2.imread(imgfilepathactual, cv2.IMREAD_GRAYSCALE)
        if gray is None:
            print("skipped (cannot read image):", imgfilepathactual)
            continue

        # Locate the content by edge detection on the horizontal gradient.
        # NOTE(review): the PDF variant of this pipeline blurs the raw image
        # instead of the Sobel result - confirm which one is intended.
        sobel_x = np.uint8(np.absolute(cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=1)))
        blurred = cv2.blur(sobel_x, (3, 3))
        canny = cv2.Canny(blurred, 5, 250)
        edge_points = np.argwhere(canny > 0)
        if edge_points.size == 0:
            print("skipped (no edges found):", imgfilepathactual)
            continue
        y_min, x_min = edge_points.min(axis=0)
        y_max, x_max = edge_points.max(axis=0)
        cropped = gray[y_min:y_max, x_min:x_max]
        if cropped.size == 0:
            print("skipped (degenerate crop):", imgfilepathactual)
            continue

        resized = cv2.resize(cropped, (1080, 720), interpolation=cv2.INTER_CUBIC)

        # Keep the top 90 % of the card. Slicing in memory avoids the lossy
        # JPEG write/read round-trips the original used for cropping.
        h1 = resized.shape[0]
        img = np.ascontiguousarray(resized[:int(round(0.90 * h1)), :])
        h2, w2 = img.shape[:2]
        print((h2, w2), "reso")

        # OCR region: bottom 25 % of the height, left 48 % of the width.
        # Its left edge is column 0 and its bottom edge is row h2, so box x
        # values map directly and y values only need flipping against h2.
        roi = np.ascontiguousarray(
            img[int(round(0.75 * h2)):h2, 0:int(round(0.48 * w2))]
        )
        cv2.imwrite('croppedimg2' + imgfilepath + '.jpeg', roi)

        aadhar_boxes = pytesseract.image_to_boxes(roi, lang="eng")
        rect = digits_bounding_box(
            parse_box_lines(aadhar_boxes), h2, max_digits=max_digits
        )
        if rect is None:
            # Never emit an unmasked document into the masked output folder.
            print("skipped (no digits recognised):", imgfilepathactual)
            continue

        cv2.rectangle(img, rect[0], rect[1], (0, 0, 0), -1)
        masked_jpeg = "maskedImage" + imgfilepath + ".jpeg"
        cv2.imwrite(masked_jpeg, img)

        pdf = img2pdf.convert(masked_jpeg)
        with open(out_prefix + imgfilepath + ".pdf", "wb") as file:
            file.write(pdf)


if __name__ == "__main__":
    mask_jpeg_dataset()
def char_rectangles(box_text, image_height):
    """Convert Tesseract box output into OpenCV rectangle corner pairs.

    Each well-formed line ``<char> <x1> <y1> <x2> <y2> <page>`` yields
    ``((x1, image_height - y1), (x2, image_height - y2))``. The subtraction
    turns Tesseract's bottom-left-origin y values into top-left-origin rows.
    Lines with fewer than five fields or non-integer coordinates are skipped.
    """
    rectangles = []
    for line in box_text.splitlines():
        fields = line.split()
        if len(fields) < 5:
            continue
        try:
            x1, y1, x2, y2 = (int(value) for value in fields[1:5])
        except ValueError:
            continue
        rectangles.append(((x1, image_height - y1), (x2, image_height - y2)))
    return rectangles


def mask_pdf_dataset(
    pattern='D:/dataset/allpdf/*.pdf',
    out_prefix="D:/dataset/allpdf/masked_files/masked",
    dpi=500,
):
    """Black out every character recognised in the lower-left of each PDF.

    For each file matching *pattern* the PDF is rasterised, cropped to its
    detected edges, resized to 1080x720, trimmed to the top 90 %, OCR'd in
    its bottom-left region, masked character by character, and written as
    ``<out_prefix><name>.pdf``. Everything happens in memory, so no temporary
    image files are created. Documents with no pages, no detectable edges,
    or no recognised characters are skipped with a message.
    """
    for filepathactual in glob.iglob(pattern):
        # basename/splitext handle both path separators on Windows and
        # file names containing extra dots.
        filepath = os.path.splitext(os.path.basename(filepathactual))[0]
        print(filepath)

        pages = convert_from_path(filepathactual, dpi)
        if not pages:
            print("skipped (no pages):", filepathactual)
            continue
        # The original saved every page to the same file name, so only the
        # last page was ever processed; that behaviour is kept explicitly.
        # NOTE(review): confirm whether multi-page PDFs should be supported.
        gray = np.array(pages[-1].convert('L'))

        # Locate the content by edge detection on the blurred page.
        blurred = cv2.blur(gray, (3, 3))
        canny = cv2.Canny(blurred, 5, 250)
        edge_points = np.argwhere(canny > 0)
        if edge_points.size == 0:
            print("skipped (no edges found):", filepathactual)
            continue
        y_min, x_min = edge_points.min(axis=0)
        y_max, x_max = edge_points.max(axis=0)
        cropped = gray[y_min:y_max, x_min:x_max]
        if cropped.size == 0:
            print("skipped (degenerate crop):", filepathactual)
            continue

        resized = cv2.resize(cropped, (1080, 720), interpolation=cv2.INTER_CUBIC)

        # Keep the top 90 % of the card.
        h1 = resized.shape[0]
        img = np.ascontiguousarray(resized[:int(round(0.90 * h1)), :])
        h2, w2 = img.shape[:2]
        print((h2, w2), "reso")

        # OCR region: bottom 35 % of the height, left 60 % of the width.
        # Its left edge is column 0 and its bottom edge is row h2, so box x
        # values map directly and y values only need flipping against h2.
        roi = np.ascontiguousarray(
            img[int(round(0.65 * h2)):h2, 0:int(round(0.6 * w2))]
        )

        aadhar_boxes = pytesseract.image_to_boxes(roi, lang="eng")
        rectangles = char_rectangles(aadhar_boxes, h2)
        if not rectangles:
            # The original would hit an unbound (or stale) maskedImage here;
            # never emit an unmasked document into the masked output folder.
            print("skipped (no characters recognised):", filepathactual)
            continue
        for corner_a, corner_b in rectangles:
            cv2.rectangle(img, corner_a, corner_b, (0, 0, 0), -1)

        encoded_ok, jpeg_buffer = cv2.imencode('.jpeg', img)
        if not encoded_ok:
            print("skipped (JPEG encoding failed):", filepathactual)
            continue

        pdf = img2pdf.convert(jpeg_buffer.tobytes())
        with open(out_prefix + filepath + ".pdf", "wb") as file:
            file.write(pdf)


if __name__ == "__main__":
    mask_pdf_dataset()

# Page text that was fused onto the last code line in the source:
# "posted on 2021-03-18 13:44:00"
只选取前八个字符的边界框来绘制矩形,该范围之后的字符将被排除在外。
import pytesseract
import numpy as np
import cv2
def first_chars_box(box_text, image_height, count=8):
    """Return one OpenCV rectangle spanning the first *count* character boxes.

    Args:
        box_text: output of ``pytesseract.image_to_boxes``, one line per
            character in the form ``<char> <x1> <y1> <x2> <y2> <page>``.
        image_height: height in pixels of the image that was OCR'd.
        count: number of leading characters to enclose.

    Returns:
        ``((left, top), (right, bottom))`` in top-left-origin pixel
        coordinates, or ``None`` when no usable box line exists. If fewer
        than *count* boxes are available, the rectangle spans those found.
    """
    rows = []
    for line in box_text.splitlines():
        fields = line.split()
        if len(fields) < 5:
            # Blank trailing line or malformed entry.
            continue
        try:
            rows.append([int(value) for value in fields[1:5]])
        except ValueError:
            continue
    selected = rows[:count]
    if not selected:
        return None
    left = selected[0][0]
    right = selected[-1][2]
    # Tesseract box coordinates are measured from the bottom-left corner,
    # OpenCV rows from the top, hence the flip.
    top = image_height - max(row[3] for row in selected)
    bottom = image_height - min(row[1] for row in selected)
    return (left, top), (right, bottom)


if __name__ == "__main__":
    img = cv2.imread('muTYX.jpg')
    if img is None:
        raise FileNotFoundError('muTYX.jpg')
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    # --psm 7 treats the image as a single text line.
    text = pytesseract.image_to_boxes(img, lang='eng', config='--psm 7 --oem 3')
    rect = first_chars_box(text, img.shape[0])
    if rect is not None:
        cv2.rectangle(img, rect[0], rect[1], (0, 255, 0), 2)
    cv2.imshow('nubmer', img)
    cv2.waitKey(0)


https://stackoverflow.com/questions/66685205
复制相似问题