Skip to content Skip to sidebar Skip to footer

Identify Text Data In Image To Read Mm/dd, Description And Amount Using Opencv Python

import re import cv2 import pytesseract from pytesseract import Output from PIL import Image from pytesseract import image_to_string img = cv2.imread('/home/cybe

Solution 1:

To identify the text in the image, you must first preprocess it. To do this, we remove the horizontal and vertical grid lines and then pass the cleaned image to Pytesseract for OCR. Here are the detected lines to be removed, highlighted in green:

enter image description here

Result

enter image description here

Output from Pytesseract

ELECTRONICWITHDRAWALSDATEDESCRIPTIONAMOUNT01/02 Merchant Service Merch Fee 8030996550 CCD ID:1841010148$34.3001/03 Authnet Gateway Billing 104820413 CCD ID:187056856922.9501/0401/04OnlineTransferToMma...4622Transaction#:7794410276200.0001/0801/08OnlinePayment7732727073ToCleaningConnoisseur300.0001/1101/11OnlinePayment7744233248ToRamseySweis148.8001/1101/11OnlineTransferToMma...4622Transaction#:7816805988200.0001/1101/11PaymentToChaseCardEndingIN2342 500.0001/11AqabaHoldings,Inahl-51 Ahl-51 CCD ID:11137200481,441.2101/1401/12PaymentToChaseCardEndingIN2342 1,000.0001/1401/12OnlineTransferToMma...4622Transaction#:78410267191,000.0001/1601/16PaymentToChaseCardEndingIN2342 1,000.0001/1601/16OnlinePayment7852542882ToOakhurstGolf&CountryClub495.0001/1701/17OnlinePayment7762351731ToRamseySweis399.0501/1801/18OnlineTransferToMma...4622Transaction#:7837118990200.0001/18 Small Business Icpayment PPD ID:1131414876302.2401/2201/21PaymentToChaseCardEndingIN2342 1,000.0001/2301/23PaymentToChaseCardEndingIN2342 1,000.0001/2501/25OnlinePayment7786855117ToLvOfficeLimitedPartnership644.40

Code

import cv2
import pytesseract
import numpy as np

# NOTE: hard-coded Windows install location of the Tesseract executable.
# Adjust this path (or drop the assignment) on machines where it differs.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


def _erase_lines(binary, canvas, kernel_size):
    """Detect straight line segments in *binary* and paint them white on *canvas*.

    A morphological opening with a long, thin rectangular kernel keeps only
    strokes at least as long as the kernel, which isolates the table's grid
    lines from the much shorter strokes of the text.

    Args:
        binary: Inverted binary image (lines/text white on black).
        canvas: BGR image that is modified in place.
        kernel_size: ``(width, height)`` of the rectangular structuring
            element, e.g. ``(35, 1)`` for horizontal lines.

    Returns:
        The contours of the detected line segments.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    detected = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=2)
    found = cv2.findContours(detected, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns (contours, hierarchy) in some OpenCV versions and
    # (image, contours, hierarchy) in others; pick the contour list either way.
    contours = found[0] if len(found) == 2 else found[1]
    for contour in contours:
        # Thickness -1 fills the contour, so the whole line is overwritten.
        cv2.drawContours(canvas, [contour], -1, (255, 255, 255), -1)
    return contours


image = cv2.imread('1.jpg')
if image is None:
    # cv2.imread signals a missing/unreadable file by returning None rather
    # than raising; fail here with a clear message instead of inside cvtColor.
    raise FileNotFoundError("Could not read image '1.jpg'")

result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Otsu picks the threshold automatically; the inversion makes ink white so the
# morphological opening below operates on the lines rather than the paper.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Find horizontal lines
_erase_lines(thresh, result, (35, 1))

# Find vertical lines
cnts = _erase_lines(thresh, result, (1, 15))

# Run OCR on the cleaned image; see the Tesseract docs for the meaning of the
# page segmentation mode selected by '--psm 6'.
data = pytesseract.image_to_string(result, lang='eng', config='--psm 6')
print(data)

cv2.imshow('thresh', thresh)
cv2.imshow('result', result)
cv2.waitKey()
# Release the preview windows once a key has been pressed.
cv2.destroyAllWindows()

Post a Comment for "Identify Text Data In Image To Read Mm/dd, Description And Amount Using Opencv Python"