PDF図面内の日本語を英語に自動変換してみた（Pythonを使う）

2024年10月23日 21:45

タイトルの通りなのですが、ちょっとやってみたくなったので試行してみました。

ChatGPTにぶん投げる（プロンプト）

こんな感じで投げてみました。

図面（PDFファイル）を読み込んで、図面内に記載されている日本語を英語に変換し、その結果を図面内に反映させてください。その際、元に書かれている日本語は消してしまってください。反映させる際は、なるべく日本語で記載されていた位置の近くに配置して欲しいですが、なるべく図面の絵に文字がかからないように配慮してください。 Pythonで実現したいと思います。コードを提示できますか？

出典元：自分

で、2～3回のやり取りの末、こうなりました。
本Pythonスクリプトと同じフォルダ内に、変換したい図面（test.pdf）を予め配置してください。その状態で本スクリプトを実行すると、translated_output.pdfが生成される仕様です。

日本語から英語への変換はgoogletransで行います。translate.google.com の Web API を使用する非公式のPythonライブラリで、無料・無制限で利用できるとされています。

import fitz  # PyMuPDF
from googletrans import Translator
from pdf2image import convert_from_path
import cv2
import numpy as np
import re
import logging

# Logging setup: records at INFO level and above are written to process.log.
logging.basicConfig(level=logging.INFO, filename='process.log')

# Detects Japanese text.
# Character ranges covered by the pattern:
#   U+3005-U+3007  iteration mark and related ideographic symbols
#   U+3040-U+30FF  hiragana and full-width katakana
#   U+3400-U+4DBF  CJK unified ideographs, extension A
#   U+4E00-U+9FFF  CJK unified ideographs
#   U+FF66-U+FF9F  half-width katakana (frequent in drawing annotations)
_JAPANESE_CHAR_RE = re.compile(
    r'[\u3005-\u3007\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uFF66-\uFF9F]'
)


def contains_japanese(text):
    """Return True if *text* contains at least one Japanese character.

    Args:
        text: The string to inspect.

    Returns:
        True when any character falls in one of the ranges listed above,
        otherwise False (including for the empty string).
    """
    return _JAPANESE_CHAR_RE.search(text) is not None

# Helper: candidate shifts for the placement search, nearest first.
def _ring_offsets(step, count):
    """Return `count` (dx, dy) offsets: the origin, then rings of 8 directions.

    Ring r contains the offsets at distance r * step straight down, up, right
    and left, followed by the four diagonals (image y grows downwards).
    """
    directions = ((0, 1), (0, -1), (1, 0), (-1, 0),
                  (1, 1), (-1, 1), (1, -1), (-1, -1))
    offsets = [(0, 0)]
    ring = 1
    while len(offsets) < count:
        offsets.extend((dx * ring * step, dy * ring * step) for dx, dy in directions)
        ring += 1
    return offsets[:count]


# Finds a spot for the text that avoids already placed text and drawing content.
def find_non_overlapping_position(image, rect, placed_text_rects, margin=20, font_size=7,
                                  max_attempts=500, shrink_every=100):
    """Search around `rect` for a blank area that no placed text occupies.

    Candidates are tried nearest first: the original position, then positions
    shifted by multiples of `margin` in eight directions. A candidate is
    accepted when it lies inside the image, intersects none of
    `placed_text_rects`, and every pixel of the candidate grown by `margin`
    is >= 250. Pixels inside the original `rect` are ignored in that check:
    they are presumed to be the source text the caller erases, so the original
    spot stays the preferred one.

    Args:
        image: Grayscale page image (NumPy array, rows = y, columns = x).
        rect: Source rectangle. Must be indexable (x0, y0, x1, y1), provide
            `intersects()`, and be constructible as `type(rect)(x0, y0, x1, y1)`
            (true for `fitz.Rect`). Must use the same coordinate system as
            `image`.
        placed_text_rects: Rectangles of text that has already been placed.
        margin: Clearance around a candidate and search step, in pixels.
        font_size: Initial font size.
        max_attempts: Maximum number of candidate positions to try.
        shrink_every: After every this many failed attempts the font size is
            reduced by 1 (never below 5). Values <= 0 disable shrinking.

    Returns:
        Tuple `(rect, font_size)`: the accepted rectangle, or the original
        `rect` unchanged when no position was found.
    """
    img_h, img_w = image.shape[:2]
    make_rect = type(rect)

    # Integer pixel bounds of the source rect, rounded outwards so that the
    # "ignore the source text" mask covers partially touched edge pixels.
    src_x0, src_y0 = int(np.floor(rect[0])), int(np.floor(rect[1]))
    src_x1, src_y1 = int(np.ceil(rect[2])), int(np.ceil(rect[3]))

    step = max(1, int(margin))

    for attempt, (dx, dy) in enumerate(_ring_offsets(step, max_attempts)):
        print(f"試行回数: {attempt + 1}回目")
        logging.info(f"試行回数: {attempt + 1}回目")  # also written to the log

        # Space is tight if many candidates have failed: shrink the font.
        if shrink_every > 0 and attempt > 0 and attempt % shrink_every == 0:
            smaller = max(5, font_size - 1)  # never below 5
            if smaller < font_size:
                font_size = smaller
                print(f"フォントサイズを {font_size} に調整しています。")
                logging.info(f"フォントサイズを {font_size} に調整しています。")

        x0, y0, x1, y1 = src_x0 + dx, src_y0 + dy, src_x1 + dx, src_y1 + dy

        # An out-of-image slice would be empty and would wrongly look blank.
        if x0 < 0 or y0 < 0 or x1 > img_w or y1 > img_h:
            continue

        candidate = make_rect(rect[0] + dx, rect[1] + dy, rect[2] + dx, rect[3] + dy)
        if any(candidate.intersects(placed_rect) for placed_rect in placed_text_rects):
            continue

        # Candidate grown by the margin, clamped to the image bounds.
        top, bottom = max(0, y0 - margin), min(img_h, y1 + margin)
        left, right = max(0, x0 - margin), min(img_w, x1 + margin)
        region = image[top:bottom, left:right].copy()

        # Blank out the part of the region that belongs to the source text.
        ov_top, ov_bottom = max(top, src_y0), min(bottom, src_y1)
        ov_left, ov_right = max(left, src_x0), min(right, src_x1)
        if ov_top < ov_bottom and ov_left < ov_right:
            region[ov_top - top:ov_bottom - top, ov_left - left:ov_right - left] = 255

        if not np.any(region < 250):  # nothing but (near-)white pixels
            return candidate, font_size

    # No suitable position was found: fall back to the original rectangle.
    print("位置を見つけられませんでした。")
    logging.info("位置を見つけられませんでした。")
    return rect, font_size

# Input/output files and the translator shared by all pages
pdf_file = "test.pdf"
output_pdf = "translated_output.pdf"
translator = Translator()

# Render every PDF page to an image (high resolution) for the blank-area search
images = convert_from_path(pdf_file, dpi=300)

# Open the PDF; the context manager closes it even if a step below raises
with fitz.open(pdf_file) as doc:
    # Process every page
    for page_num, page_image in enumerate(images):
        page = doc.load_page(page_num)
        blocks = page.get_text("blocks")

        # Convert to OpenCV format, then to grayscale
        page_image_cv = cv2.cvtColor(np.array(page_image), cv2.COLOR_RGB2BGR)
        gray_image = cv2.cvtColor(page_image_cv, cv2.COLOR_BGR2GRAY)

        # Block rectangles are in PDF points while the rendered image is in
        # pixels, so the placement search needs rectangles scaled to pixels.
        # NOTE(review): assumes the rendered image covers exactly page.rect
        # with no rotation -- confirm for rotated or cropped pages.
        scale_x = gray_image.shape[1] / page.rect.width
        scale_y = gray_image.shape[0] / page.rect.height
        to_pixels = fitz.Matrix(scale_x, scale_y)
        to_points = fitz.Matrix(1 / scale_x, 1 / scale_y)

        # Pass 1: translate the Japanese blocks and mark the originals for removal
        translations = []
        for block in blocks:
            rect = fitz.Rect(block[:4])  # position of the text block
            text = block[4]  # text content

            print(f"ページ {page_num + 1} のテキスト: {text}")

            if contains_japanese(text):
                translated_text = translator.translate(text, src='ja', dest='en').text
                page.add_redact_annot(rect, fill=(1, 1, 1))  # erase the original Japanese
                translations.append((rect, translated_text))

        # Apply all redactions once, BEFORE inserting any text; otherwise a
        # later redaction could erase a translation that was already placed.
        if translations:
            page.apply_redactions()

        # Rectangles (in pixels) of the English text placed so far on this page
        placed_text_rects = []

        # Pass 2: find a position for each translation and insert it
        for rect, translated_text in translations:
            suitable_rect, font_size = find_non_overlapping_position(
                gray_image, rect * to_pixels, placed_text_rects
            )
            placed_text_rects.append(suitable_rect)

            # Back to PDF points. insert_text expects the baseline start of the
            # first line, so move down from the top edge by one font size.
            target = suitable_rect * to_points
            page.insert_text((target.x0, target.y0 + font_size), translated_text, fontsize=font_size, color=(0, 0, 0))

    # Save the result
    doc.save(output_pdf)

print(f"変換が完了しました: {output_pdf}")
logging.info(f"変換が完了しました: {output_pdf}")

変換前の図面

変換後の図面

変換処理に私のPCでは60秒程度かかりました。

おわりに

今回のコードでは文字どうしの重なりを避ける処理を入れていますので、思ったより良くなっています。もう少し試行してみたいと思います。