OCR(Optical Character Recognition, 광학 문자 인식)
이해하기
OCR(Optical Character Recognition:광학 문자 인식)은 스캔한 이미지나 사진에서 문자를 인식하여 컴퓨터에서 사용 가능한 텍스트로 변환하는 기술입니다.
시작하기
pytesseract 설치, 한글 언어 포함
참고
https://yunwoong.tistory.com/51
https://yunwoong.tistory.com/58?category=902345
https://yunwoong.tistory.com/72?category=902345
https://yunwoong.tistory.com/73?category=902345
from PIL import Image
import pytesseract
# Tesseract 실행 파일 경로를 설정 (Windows의 경우 필요)
pytesseract.pytesseract.tesseract_cmd = r'C:/programs/Tesseract-OCR/tesseract.exe'
# 이미지 파일 경로
image_path = 'C:/workspace/study/ai/02.embedding/dataset/00000000000001327999.jpg'
print('>>> image_path', image_path)
# 이미지 열기
image = Image.open(image_path)
print('>>> image', image)
# OCR을 사용하여 텍스트 추출
text = pytesseract.image_to_string(image)
print(text)
이미지 읽어와 테두리 자르고 문자 읽기
from imutils.perspective import four_point_transform
import matplotlib.pyplot as plt
import pytesseract
import imutils
import cv2
import re
import requests
import numpy as np
# Tesseract 실행 파일 경로를 설정 (Windows의 경우 필요)
pytesseract.pytesseract.tesseract_cmd = r'C:/programs/Tesseract-OCR/tesseract.exe'
def plt_imshow(title='image', img=None, figsize=(8 ,5)):
plt.figure(figsize=figsize)
if type(img) == list:
if type(title) == list:
titles = title
else:
titles = []
for i in range(len(img)):
titles.append(title)
for i in range(len(img)):
if len(img[i].shape) <= 2:
rgbImg = cv2.cvtColor(img[i], cv2.COLOR_GRAY2RGB)
else:
rgbImg = cv2.cvtColor(img[i], cv2.COLOR_BGR2RGB)
plt.subplot(1, len(img), i + 1), plt.imshow(rgbImg)
plt.title(titles[i])
plt.xticks([]), plt.yticks([])
plt.show()
else:
if len(img.shape) < 3:
rgbImg = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
else:
rgbImg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(rgbImg)
plt.title(title)
plt.xticks([]), plt.yticks([])
plt.show()
url = 'https://user-images.githubusercontent.com/69428232/148330274-237d9b23-4a79-4416-8ef1-bb7b2b52edc4.jpg'
image_nparray = np.asarray(bytearray(requests.get(url).content), dtype=np.uint8)
org_image = cv2.imdecode(image_nparray, cv2.IMREAD_COLOR)
plt_imshow("orignal image", org_image)
image = org_image.copy()
image = imutils.resize(image, width=500)
ratio = org_image.shape[1] / float(image.shape[1])
# 이미지를 grayscale로 변환하고 blur를 적용
# 모서리를 찾기위한 이미지 연산
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5,), 0)
edged = cv2.Canny(blurred, 75, 200)
plt_imshow(['gray', 'blurred', 'edged'], [gray, blurred, edged])
# contours를 찾아 크기순으로 정렬
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)
receiptCnt = None
# 정렬된 contours를 반복문으로 수행하며 4개의 꼭지점을 갖는 도형을 검출
for c in cnts:
peri = cv2.arcLength(c, True)
approx = cv2.approxPolyDP(c, 0.02 * peri, True)
# contours가 크기순으로 정렬되어 있기때문에 제일 첫번째 사각형을 영수증 영역으로 판단하고 break
if len(approx) == 4:
receiptCnt = approx
break
# 만약 추출한 윤곽이 없을 경우 오류
if receiptCnt is None:
raise Exception(("Could not find receipt outline."))
output = image.copy()
cv2.drawContours(output, [receiptCnt], -1, (0, 255, 0), 2)
plt_imshow("Receipt Outline", output)
# 원본 이미지에 찾은 윤곽을 기준으로 이미지를 보정
receipt = four_point_transform(org_image, receiptCnt.reshape(4, 2) * ratio)
plt_imshow("Receipt Transform", receipt)
options = "--psm 4"
text = pytesseract.image_to_string(cv2.cvtColor(receipt, cv2.COLOR_BGR2RGB), config=options)
# OCR결과 출력
print("[INFO] OCR결과:")
print("==================")
print(text)
print("\n")
함수화
from imutils.perspective import four_point_transform
import matplotlib.pyplot as plt
import pytesseract
import imutils
import cv2
import re
import requests
import numpy as np
# Tesseract 실행 파일 경로를 설정 (Windows의 경우 필요)
pytesseract.pytesseract.tesseract_cmd = r'C:/programs/Tesseract-OCR/tesseract.exe'
def run_tesseract_ocr(image, width, ksize=(5,5), min_threshold=75, max_threshold=200, lang='eng'):
image_list_title = []
image_list = []
image = imutils.resize(image, width=width)
ratio = org_image.shape[1] / float(image.shape[1])
# 이미지를 grayscale로 변환하고 blur를 적용
# 모서리를 찾기위한 이미지 연산
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, ksize, 0)
edged = cv2.Canny(blurred, min_threshold, max_threshold)
image_list_title = ['gray', 'blurred', 'edged']
image_list = [gray, blurred, edged]
# contours를 찾아 크기순으로 정렬
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)
receiptCnt = None
# 정렬된 contours를 반복문으로 수행하며 4개의 꼭지점을 갖는 도형을 검출
for c in cnts:
peri = cv2.arcLength(c, True)
approx = cv2.approxPolyDP(c, 0.02 * peri, True)
# contours가 크기순으로 정렬되어 있기때문에 제일 첫번째 사각형을 영수증 영역으로 판단하고 break
if len(approx) == 4:
receiptCnt = approx
break
# 만약 추출한 윤곽이 없을 경우 오류
if receiptCnt is None:
raise Exception(("Could not find receipt outline."))
output = image.copy()
cv2.drawContours(output, [receiptCnt], -1, (0, 255, 0), 2)
image_list_title.append("Receipt Outline")
image_list.append(output)
# 원본 이미지에 찾은 윤곽을 기준으로 이미지를 보정
receipt = four_point_transform(org_image, receiptCnt.reshape(4, 2) * ratio)
plt_imshow(image_list_title, image_list)
plt_imshow("Receipt Transform", receipt)
options = "--psm 4"
text = pytesseract.image_to_string(cv2.cvtColor(receipt, cv2.COLOR_BGR2RGB), lang=lang, config=options)
# OCR결과 출력
print("[INFO] OCR결과:")
print("==================")
print(text)
def plt_imshow(title='image', img=None, figsize=(8 ,5)):
plt.figure(figsize=figsize)
if type(img) == list:
if type(title) == list:
titles = title
else:
titles = []
for i in range(len(img)):
titles.append(title)
for i in range(len(img)):
if len(img[i].shape) <= 2:
rgbImg = cv2.cvtColor(img[i], cv2.COLOR_GRAY2RGB)
else:
rgbImg = cv2.cvtColor(img[i], cv2.COLOR_BGR2RGB)
plt.subplot(1, len(img), i + 1), plt.imshow(rgbImg)
plt.title(titles[i])
plt.xticks([]), plt.yticks([])
plt.show()
else:
if len(img.shape) < 3:
rgbImg = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
else:
rgbImg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(rgbImg)
plt.title(title)
plt.xticks([]), plt.yticks([])
plt.show()
url = 'https://user-images.githubusercontent.com/69428232/155486780-55525c3c-8f5f-4313-8590-dd69d4ce4111.jpg'
image_nparray = np.asarray(bytearray(requests.get(url).content), dtype=np.uint8)
org_image = cv2.imdecode(image_nparray, cv2.IMREAD_COLOR)
run_tesseract_ocr(org_image, width=200, ksize=(5, 5), min_threshold=20, max_threshold=100, lang='kor+eng')
로컬 파일 읽어와 실행
from imutils.perspective import four_point_transform
from imutils.contours import sort_contours
import matplotlib.pyplot as plt
import pytesseract
import imutils
import cv2
import numpy as np
# Tesseract 실행 파일 경로를 설정 (Windows의 경우 필요)
pytesseract.pytesseract.tesseract_cmd = r'C:/programs/Tesseract-OCR/tesseract.exe'
def make_scan_image(image, width, ksize=(5,5), min_threshold=75, max_threshold=200):
image_list_title = []
image_list = []
org_image = image.copy()
image = imutils.resize(image, width=width)
ratio = org_image.shape[1] / float(image.shape[1])
# 이미지를 grayscale로 변환하고 blur를 적용
# 모서리를 찾기위한 이미지 연산
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, ksize, 0)
edged = cv2.Canny(blurred, min_threshold, max_threshold)
image_list_title = ['gray', 'blurred', 'edged']
image_list = [gray, blurred, edged]
# contours를 찾아 크기순으로 정렬
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)
findCnt = None
# 정렬된 contours를 반복문으로 수행하며 4개의 꼭지점을 갖는 도형을 검출
for c in cnts:
peri = cv2.arcLength(c, True)
approx = cv2.approxPolyDP(c, 0.02 * peri, True)
# contours가 크기순으로 정렬되어 있기때문에 제일 첫번째 사각형을 영역으로 판단하고 break
if len(approx) == 4:
findCnt = approx
break
# 만약 추출한 윤곽이 없을 경우 오류
if findCnt is None:
raise Exception("Could not find outline.")
output = image.copy()
cv2.drawContours(output, [findCnt], -1, (0, 255, 0), 2)
image_list_title.append("Outline")
image_list.append(output)
# 원본 이미지에 찾은 윤곽을 기준으로 이미지를 보정
transform_image = four_point_transform(org_image, findCnt.reshape(4, 2) * ratio)
plt_imshow(image_list_title, image_list)
plt_imshow("Transform", transform_image)
return transform_image
def plt_imshow(title='image', img=None, figsize=(8 ,5)):
plt.figure(figsize=figsize)
if type(img) == list:
if type(title) == list:
titles = title
else:
titles = []
for i in range(len(img)):
titles.append(title)
for i in range(len(img)):
if len(img[i].shape) <= 2:
rgbImg = cv2.cvtColor(img[i], cv2.COLOR_GRAY2RGB)
else:
rgbImg = cv2.cvtColor(img[i], cv2.COLOR_BGR2RGB)
plt.subplot(1, len(img), i + 1), plt.imshow(rgbImg)
plt.title(titles[i])
plt.xticks([]), plt.yticks([])
plt.show()
else:
if len(img.shape) < 3:
rgbImg = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
else:
rgbImg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(rgbImg)
plt.title(title)
plt.xticks([]), plt.yticks([])
plt.show()
# 로컬 이미지 파일 경로
image_path = 'C:/workspace/study/ai/02.embedding/dataset/ttttt.jpg'
org_image = cv2.imread(image_path)
if org_image is None:
raise Exception("Could not load image. Check the file path.")
business_card_image = make_scan_image(org_image, width=100, ksize=(5, 5), min_threshold=20, max_threshold=20)
options = "--psm 4"
text = pytesseract.image_to_string(business_card_image, config=options, lang='kor+eng')
# OCR결과 출력
print("[INFO] OCR결과:")
print("==================")
print(text)
print("\n")
상세 고도화
from imutils.perspective import four_point_transform
from imutils.contours import sort_contours
import matplotlib.pyplot as plt
import pytesseract
import imutils
import cv2
import re
import requests
import numpy as np
# Tesseract 실행 파일 경로를 설정 (Windows의 경우 필요)
pytesseract.pytesseract.tesseract_cmd = r'C:/programs/Tesseract-OCR/tesseract.exe'
def make_scan_image(image, width, ksize=(5,5), min_threshold=75, max_threshold=200):
image_list_title = []
image_list = []
org_image = image.copy()
image = imutils.resize(image, width=width)
ratio = org_image.shape[1] / float(image.shape[1])
# 이미지를 grayscale로 변환하고 blur를 적용
# 모서리를 찾기위한 이미지 연산
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, ksize, 0)
edged = cv2.Canny(blurred, min_threshold, max_threshold)
image_list_title = ['gray', 'blurred', 'edged']
image_list = [gray, blurred, edged]
# contours를 찾아 크기순으로 정렬
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)
findCnt = None
# 정렬된 contours를 반복문으로 수행하며 4개의 꼭지점을 갖는 도형을 검출
for c in cnts:
peri = cv2.arcLength(c, True)
approx = cv2.approxPolyDP(c, 0.02 * peri, True)
# contours가 크기순으로 정렬되어 있기때문에 제일 첫번째 사각형을 영역으로 판단하고 break
if len(approx) == 4:
findCnt = approx
break
# 만약 추출한 윤곽이 없을 경우 오류
if findCnt is None:
raise Exception(("Could not find outline."))
output = image.copy()
cv2.drawContours(output, [findCnt], -1, (0, 255, 0), 2)
image_list_title.append("Outline")
image_list.append(output)
# 원본 이미지에 찾은 윤곽을 기준으로 이미지를 보정
transform_image = four_point_transform(org_image, findCnt.reshape(4, 2) * ratio)
plt_imshow(image_list_title, image_list)
plt_imshow("Transform", transform_image)
return transform_image
def plt_imshow(title='image', img=None, figsize=(8 ,5)):
plt.figure(figsize=figsize)
if type(img) == list:
if type(title) == list:
titles = title
else:
titles = []
for i in range(len(img)):
titles.append(title)
for i in range(len(img)):
if len(img[i].shape) <= 2:
rgbImg = cv2.cvtColor(img[i], cv2.COLOR_GRAY2RGB)
else:
rgbImg = cv2.cvtColor(img[i], cv2.COLOR_BGR2RGB)
plt.subplot(1, len(img), i + 1), plt.imshow(rgbImg)
plt.title(titles[i])
plt.xticks([]), plt.yticks([])
plt.show()
else:
if len(img.shape) < 3:
rgbImg = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
else:
rgbImg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(rgbImg)
plt.title(title)
plt.xticks([]), plt.yticks([])
plt.show()
def mergeResize(img, row=300, col=200):
IMG_COL = col #66
# row값에 따른 col값 변경
IMG_COL = int((row * IMG_COL)/row)
IMG_ROW = row
border_v = 0
border_h = 0
if (IMG_COL / IMG_ROW) >= (img.shape[0] / img.shape[1]):
border_v = int((((IMG_COL / IMG_ROW) * img.shape[1]) - img.shape[0]) / 2)
else:
border_h = int((((IMG_ROW / IMG_COL) * img.shape[0]) - img.shape[1]) / 2)
img = cv2.copyMakeBorder(img, top=border_v, bottom=border_v, left=0, right=border_h + border_h, borderType=cv2.BORDER_CONSTANT, value=(255, 255, 255))
img = cv2.resize(img, (IMG_ROW, IMG_COL))
return img
url = 'https://user-images.githubusercontent.com/69428232/155486780-55525c3c-8f5f-4313-8590-dd69d4ce4111.jpg'
# url = 'https://user-images.githubusercontent.com/69428232/148330274-237d9b23-4a79-4416-8ef1-bb7b2b52edc4.jpg'
image_nparray = np.asarray(bytearray(requests.get(url).content), dtype=np.uint8)
org_image = cv2.imdecode(image_nparray, cv2.IMREAD_COLOR)
business_card_image = make_scan_image(org_image, width=200, ksize=(5, 5), min_threshold=20, max_threshold=100)
options = "--psm 4"
text = pytesseract.image_to_string(business_card_image, config=options, lang='kor+eng')
# OCR결과 출력
print("[INFO] OCR결과:")
print("==================")
print(text)
print("\n")
##########################################################################################
receipt = business_card_image
gray = cv2.cvtColor(receipt, cv2.COLOR_BGR2GRAY)
(H, W) = gray.shape
# rectKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 20))
# sqKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 21))
rectKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20,15))
sqKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 16))
gray = cv2.GaussianBlur(gray, (11, 11), 0)
blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, rectKernel)
grad = cv2.Sobel(blackhat, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
grad = np.absolute(grad)
(minVal, maxVal) = (np.min(grad), np.max(grad))
grad = (grad - minVal) / (maxVal - minVal)
grad = (grad * 255).astype("uint8")
grad = cv2.morphologyEx(grad, cv2.MORPH_CLOSE, rectKernel)
thresh = cv2.threshold(grad, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
close_thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, sqKernel)
close_thresh = cv2.erode(close_thresh, None, iterations=2)
plt_imshow(["Original", "Blackhat", "Gradient", "Rect Close", "Square Close"], [receipt, blackhat, grad, thresh, close_thresh], figsize=(16, 10))
plt_imshow(["Square Close"], [close_thresh], figsize=(16, 10))
cnts = cv2.findContours(close_thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
cnts = sort_contours(cnts, method="top-to-bottom")[0]
roi_list = []
roi_title_list = []
margin = 20
receipt_grouping = receipt.copy()
for c in cnts:
(x, y, w, h) = cv2.boundingRect(c)
ar = w / float(h) # Modify this line for proper aspect ratio
# Debugging: Print contour information
# print(f"Contour - x: {x}, y: {y}, w: {w}, h: {h}, ar: {ar}")
# if ar > 3.0 and ar < 6.5 and (W/2) < x:
if ar > 0.5 and ar < 10.0 and (W/4) < x:
color = (0, 255, 0)
roi = receipt[y - margin:y + h + margin, x - margin:x + w + margin]
roi_list.append(roi)
roi_title_list.append("Roi_{}".format(len(roi_list)))
else:
color = (0, 0, 255)
cv2.rectangle(receipt_grouping, (x - margin, y - margin), (x + w + margin, y + h + margin), color, 2)
cv2.putText(receipt_grouping, "".join(str(ar)), (x, y - 15), cv2.FONT_HERSHEY_SIMPLEX, 0.65, color, 2)
# plt_imshow(["Grouping Image"], [receipt_grouping], figsize=(16, 10))
# plt_imshow(roi_title_list, roi_list, figsize=(16, 10))
# for roi in roi_list:
# gray_roi= cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
# threshold_roi = cv2.threshold(gray_roi, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# roi_text = pytesseract.image_to_string(threshold_roi)
# print(roi_text)
mergeImg = receipt_grouping
plt_imshow(["Merge Image"], [mergeImg])
for idx, roi in enumerate(roi_list):
if idx == 0:
mergeImg = mergeResize(roi)
else:
cropImg = mergeResize(roi)
mergeImg = np.concatenate((mergeImg, cropImg), axis=0)
threshold_mergeImg = cv2.threshold(mergeImg, 150, 255, cv2.THRESH_BINARY)[1]
plt_imshow(["Merge Image"], [threshold_mergeImg])
options = "--psm 4"
text = pytesseract.image_to_string(receipt, config=options, lang='kor+eng')
# OCR결과 출력
print("[INFO] OCR결과:")
print("==================")
print(text)
print("\n")
# tel = re.findall(r'(?:Tel )([\+\(]?[0-9][0-9 .\-\(\)]{8,}[0-9])', text)[0]
# mobile = re.findall(r'(?:Mobile )([\+\(]?[0-9][0-9 .\-\(\)]{8,}[0-9])', text)[0]
# emails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", text)[0]
addr = re.findall(r"[0-9\.\-+_]+\,.*", text)[0]
print("주소 : {}".format(addr))