Identify Text Data In Image To Read Mm/dd, Description And Amount Using Opencv Python
import re import cv2 import pytesseract from pytesseract import Output from PIL import Image from pytesseract import image_to_string img = cv2.imread('/home/cybe
Solution 1:
To identify the text in the image, you must first preprocess it. To do this, we can remove the horizontal and vertical grid lines and then pass the cleaned image to Pytesseract OCR. Here are the detected lines to be removed, highlighted in green:
Result
Output from Pytesseract
ELECTRONICWITHDRAWALSDATEDESCRIPTIONAMOUNT01/02 Merchant Service Merch Fee 8030996550 CCD ID:1841010148$34.3001/03 Authnet Gateway Billing 104820413 CCD ID:187056856922.9501/0401/04OnlineTransferToMma...4622Transaction#:7794410276200.0001/0801/08OnlinePayment7732727073ToCleaningConnoisseur300.0001/1101/11OnlinePayment7744233248ToRamseySweis148.8001/1101/11OnlineTransferToMma...4622Transaction#:7816805988200.0001/1101/11PaymentToChaseCardEndingIN2342 500.0001/11AqabaHoldings,Inahl-51 Ahl-51 CCD ID:11137200481,441.2101/1401/12PaymentToChaseCardEndingIN2342 1,000.0001/1401/12OnlineTransferToMma...4622Transaction#:78410267191,000.0001/1601/16PaymentToChaseCardEndingIN2342 1,000.0001/1601/16OnlinePayment7852542882ToOakhurstGolf&CountryClub495.0001/1701/17OnlinePayment7762351731ToRamseySweis399.0501/1801/18OnlineTransferToMma...4622Transaction#:7837118990200.0001/18 Small Business Icpayment PPD ID:1131414876302.2401/2201/21PaymentToChaseCardEndingIN2342 1,000.0001/2301/23PaymentToChaseCardEndingIN2342 1,000.0001/2501/25OnlinePayment7786855117ToLvOfficeLimitedPartnership644.40
Code
import cv2
import pytesseract
import numpy as np
# Script: strip the table's grid lines from a scanned image, then OCR it.
#
# Steps:
#   1. Load the image and build an inverted binary mask (ink -> white).
#   2. Detect long horizontal and vertical strokes with morphological opening.
#   3. Paint those strokes white on a copy of the original image.
#   4. Run Tesseract on the cleaned copy and print the recognized text.

# Tell pytesseract where the Tesseract executable lives (Windows install path).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

image = cv2.imread('1.jpg')
# cv2.imread signals failure by returning None rather than raising, so fail
# early with a clear message instead of an AttributeError further down.
if image is None:
    raise FileNotFoundError("Could not read image '1.jpg'")

# `result` is the copy the grid lines get erased from; `image` stays untouched.
result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverted Otsu threshold: the threshold value is chosen automatically and
# dark pixels (text, grid lines) become white foreground in `thresh`.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Find horizontal lines
# Opening with a wide, 1-pixel-tall kernel keeps only long horizontal runs.
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (35, 1))
detect_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
cnts = cv2.findContours(detect_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns a 2-tuple or a 3-tuple depending on the OpenCV
# version; the contour list is element 0 in the former and 1 in the latter.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    # Thickness -1 fills the contour, painting the detected line white.
    cv2.drawContours(result, [c], -1, (255, 255, 255), -1)

# Find vertical lines
# Same idea with a 1-pixel-wide, tall kernel to keep only vertical runs.
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 15))
detect_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
cnts = cv2.findContours(detect_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    cv2.drawContours(result, [c], -1, (255, 255, 255), -1)

# OCR the cleaned image; `--psm 6` selects Tesseract's page segmentation
# mode 6 (treat the image as a single uniform block of text).
data = pytesseract.image_to_string(result, lang='eng', config='--psm 6')
print(data)

# Show the binary mask and the cleaned image; block until a key is pressed.
cv2.imshow('thresh', thresh)
cv2.imshow('result', result)
cv2.waitKey()
Post a Comment for "Identify Text Data In Image To Read Mm/dd, Description And Amount Using Opencv Python"