macOS自带OCR识别图片文字

2024-10-12
#Unix #Python

在 macOS 系统上,用 Python 调用系统自带的 OCR 功能(Vision 框架)识别图片中的文字,支持简体中文、繁体中文和英文,横排或竖排均可,识别结果保存到一个 txt 文档。

import os
import Vision
import Quartz
from Cocoa import NSURL
from PIL import Image, ImageOps, ImageFilter
import cv2
import numpy as np
import tempfile
from tqdm import tqdm  # 用于显示处理进度
import logging

# Input and output locations (placeholder paths).
input_dir = "/path/to/your/images"  # directory scanned for input images
output_txt = "/path/to/output.txt"   # text file that receives the OCR results

# Send log records at INFO level and above to a file named relative to the
# current working directory.
logging.basicConfig(level=logging.INFO, filename="ocr_process.log")

# Automatic skew correction
def deskew_image(image_path):
    """Rotate the image at *image_path* to undo its estimated skew.

    The image is loaded as grayscale and binarized with Otsu's threshold.
    The angle of the minimum-area rectangle around the non-zero pixels is
    taken as the skew estimate, and the grayscale image is rotated about its
    center by the opposite angle.

    Args:
        image_path: Path of the image file to read.

    Returns:
        Path of a temporary PNG file holding the rotated grayscale image.
        The file is not deleted here; the caller must remove it.

    Raises:
        ValueError: If OpenCV cannot read *image_path*.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise ValueError(f"Cannot read image: {image_path}")

    # Binarize with Otsu's method (a single, automatically chosen threshold).
    _, binary_image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Coordinates of every non-zero pixel in the binarized image.
    # NOTE(review): with THRESH_BINARY the non-zero pixels are the bright
    # ones; for dark text on a light page that is the background rather than
    # the text — confirm whether THRESH_BINARY_INV was intended.
    coords = np.column_stack(np.where(binary_image > 0))

    if coords.size == 0:
        # Nothing to fit a rectangle around; leave the image unrotated.
        angle = 0.0
    else:
        angle = cv2.minAreaRect(coords)[-1]
        # Fold the reported angle into [-45, 45] before negating it. Older
        # OpenCV releases report angles in [-90, 0), newer ones in (0, 90];
        # handling both keeps an axis-aligned page from being turned by 90°.
        if angle < -45:
            angle += 90
        elif angle > 45:
            angle -= 90
        angle = -angle

    # Rotate about the image center, keeping the original canvas size.
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated_image = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    # Reserve a temp file name and close the handle right away so no file
    # descriptor is leaked; OpenCV reopens the path itself when writing.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
        temp_path = temp_file.name
    cv2.imwrite(temp_path, rotated_image)

    return temp_path

# Image preprocessing: deskew, grayscale, border crop, autocontrast, sharpen, denoise
def preprocess_image(image_path):
    """Prepare an image for OCR and return the path of the processed copy.

    The image is deskewed with ``deskew_image`` and then, in order:
    converted to grayscale, cropped by 10 pixels on every side, contrast
    normalized, sharpened, and median filtered.

    Args:
        image_path: Path of the image file to preprocess.

    Returns:
        Path of a temporary PNG file holding the processed image. The file
        is not deleted here; the caller must remove it.
    """
    corrected_image_path = deskew_image(image_path)
    try:
        # Close the source file as soon as the grayscale copy exists.
        with Image.open(corrected_image_path) as source:
            img = ImageOps.grayscale(source)

        # Remove a fixed 10-pixel border from every side.
        img = ImageOps.crop(img, border=10)

        # Normalize contrast (this is not a thresholding/binarization step).
        img = ImageOps.autocontrast(img)

        # Sharpen, then suppress speckle noise with a 3x3 median filter.
        img = img.filter(ImageFilter.SHARPEN)
        img = img.filter(ImageFilter.MedianFilter(size=3))

        # Reserve a temp file name and close the handle immediately so no
        # file descriptor is leaked; Pillow reopens the path when saving.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
            temp_path = temp_file.name
        try:
            img.save(temp_path)
        except Exception:
            # Do not leave a half-written temp file behind on failure.
            os.remove(temp_path)
            raise

        return temp_path
    finally:
        # The deskewed intermediate is removed on success and on failure.
        if os.path.exists(corrected_image_path):
            os.remove(corrected_image_path)

# Preprocess a single image and run OCR on it
def process_image(image_path, min_confidence=0.8, languages=("zh-Hans", "zh-Hant", "en")):
    """Return the text recognized in one image by the Vision framework.

    The image is first run through ``preprocess_image``; the preprocessed
    temporary file is deleted before returning, whether or not OCR succeeded.

    Args:
        image_path: Path of the image file to recognize.
        min_confidence: Top candidates whose confidence is below this value
            are dropped from the result.
        languages: Recognition languages handed to the text request; the
            default covers Simplified Chinese, Traditional Chinese and English.

    Returns:
        The accepted text lines joined with newlines, or an empty string when
        the image could not be processed (the failure is logged).
    """
    preprocessed_image_path = None
    try:
        preprocessed_image_path = preprocess_image(image_path)

        # Load the preprocessed file as a CIImage.
        image_url = NSURL.fileURLWithPath_(preprocessed_image_path)
        ci_image = Quartz.CIImage.imageWithContentsOfURL_(image_url)
        if ci_image is None:
            # Loading failed; do not hand a nil image to the request handler.
            logging.error(f"Error processing {image_path}: could not load preprocessed image")
            return ""

        # Configure the text recognition request.
        ocr_request = Vision.VNRecognizeTextRequest.alloc().init()
        ocr_request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        ocr_request.setRecognitionLanguages_(list(languages))
        ocr_request.setUsesLanguageCorrection_(True)

        # Run the request against the image.
        handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(ci_image, None)
        success, error = handler.performRequests_error_([ocr_request], None)

        if not success:
            logging.error(f"Error processing {image_path}: {error}")
            return ""

        # Keep the best candidate of each text observation if it is confident
        # enough. `results()` may be nil, so fall back to an empty sequence.
        recognized_text = []
        for observation in ocr_request.results() or ():
            if not isinstance(observation, Vision.VNRecognizedTextObservation):
                continue
            for candidate in observation.topCandidates_(1):
                if candidate.confidence() >= min_confidence:
                    recognized_text.append(candidate.string())

        return "\n".join(recognized_text)

    except Exception as e:
        # Best effort: one bad image must not abort a batch. Log the full
        # traceback so the failure can still be diagnosed.
        logging.exception("Failed to process image %s: %s", image_path, e)
        return ""

    finally:
        # Remove the preprocessed temporary file.
        if preprocessed_image_path and os.path.exists(preprocessed_image_path):
            os.remove(preprocessed_image_path)

# Process every image file in a directory and save the results
def process_all_images(input_dir, output_txt):
    """Run OCR on each image in *input_dir* and write the text to *output_txt*.

    Files are selected by extension (case-insensitively) and processed in
    sorted path order. For each image the output file receives a
    ``File: <name>`` header line, the recognized text, and a blank line.

    Args:
        input_dir: Directory whose image files are processed (not recursive).
        output_txt: Path of the UTF-8 text file to create or overwrite.
    """
    extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')
    image_files = sorted(
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        # Compare lower-cased names so files such as "IMG_0001.JPG" or
        # "scan.PNG" are not silently skipped.
        if name.lower().endswith(extensions)
    )

    with open(output_txt, 'w', encoding='utf-8') as f:
        for image_file in tqdm(image_files):
            base_name = os.path.basename(image_file)
            result = process_image(image_file)
            f.write(f"File: {base_name}\n")
            f.write(f"{result}\n\n")
            logging.info(f"Processed {base_name}")

# Run the batch over the configured paths, then report completion.
process_all_images(input_dir=input_dir, output_txt=output_txt)
print("OCR 处理完成,结果已保存到 txt 文件。")