CivilのKeTCindy教室: 画像のテキスト化

画像をテキスト化するPythonコード例

from PIL import Image

import pytesseract

# TesseractがPATHに通っていない場合は、下の行のコメントを外してインストールパスを指定する（例: Windowsの場合）

# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Typographic (curly) quotes mapped to their plain ASCII equivalents.
# Extend this table if further characters need normalizing.
_QUOTE_TABLE = str.maketrans({
    '\u2018': "'",   # left single quotation mark
    '\u2019': "'",   # right single quotation mark / typographic apostrophe
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
})


def extract_text_from_image(image_path, lang='eng'):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Location of the image to read; passed unchanged to
            ``Image.open``.
        lang: Tesseract language code(s) forwarded to
            ``pytesseract.image_to_string``. Defaults to ``'eng'``; pass
            e.g. ``'jpn+eng'`` to recognize both Japanese and English.

    Returns:
        The OCR output with curly single and double quotation marks
        replaced by plain ASCII ``'`` and ``"``.
    """
    # The context manager closes the image file as soon as OCR has finished
    # instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img, lang=lang)

    # One translate() pass replaces every mapped quotation mark.
    return text.translate(_QUOTE_TABLE)

# Usage example: run OCR on a sample image and print the recognized text.

image_path = 'sample.jpg'  # relative path of the image to process

text = extract_text_from_image(image_path)

print(text)

メニュー