При объединении слов учитываются параграфы

This commit is contained in:
2023-03-10 21:20:35 +03:00
parent 1fc7726501
commit eeba470caa

63
main.py
View File

@@ -1,5 +1,4 @@
import time import time
import random
import math import math
import logging import logging
import numpy as np import numpy as np
@@ -90,11 +89,15 @@ def get_screenshot():
def get_boxes(screenshot): def get_boxes(screenshot):
inverted_screenshot = cv2.bitwise_not(
cv2.cvtColor(screenshot, cv2.COLOR_BGR2GRAY)
)
raw_boxes = pytesseract.image_to_data( raw_boxes = pytesseract.image_to_data(
screenshot, inverted_screenshot,
lang="eng", lang="eng",
output_type=Output.DATAFRAME, output_type=Output.DATAFRAME,
config="--psm 3 -c preserve_interword_spaces=1", config="--psm 3",
) )
if save_result: if save_result:
@@ -123,35 +126,45 @@ def union_boxes(base_boxes):
"text", "text",
"block_num", "block_num",
"line_num", "line_num",
"par_num",
] ]
) )
for box in base_boxes["block_num"].unique(): for box in base_boxes["block_num"].unique():
words_in_blocks = base_boxes.loc[base_boxes["block_num"] == box] paragraphs_in_box = base_boxes.loc[base_boxes["block_num"] == box][
"par_num"
].unique()
for paragraph in paragraphs_in_box:
words_in_paragraph = base_boxes.loc[
(base_boxes["block_num"] == box)
& (base_boxes["par_num"] == paragraph),
]
grouped_words = words_in_blocks.groupby("line_num", as_index=False) grouped_words = words_in_paragraph.groupby(
"line_num", as_index=False
)
box_phrases = grouped_words["width"].sum() box_phrases = grouped_words["width"].sum()
box_phrases = box_phrases.merge( box_phrases = box_phrases.merge(
grouped_words["height"].max(), on="line_num", how="left" grouped_words["height"].max(), on="line_num", how="left"
) )
box_phrases = box_phrases.merge( box_phrases = box_phrases.merge(
grouped_words["left"].min(), on="line_num", how="left" grouped_words["left"].min(), on="line_num", how="left"
) )
box_phrases = box_phrases.merge( box_phrases = box_phrases.merge(
grouped_words["top"].min(), on="line_num", how="left" grouped_words["top"].min(), on="line_num", how="left"
) )
box_phrases = box_phrases.merge( box_phrases = box_phrases.merge(
grouped_words["text"].apply(" ".join), grouped_words["text"].apply(" ".join),
on="line_num", on="line_num",
how="left", how="left",
) )
box_phrases["block_num"] = box box_phrases["block_num"] = box
rightest_box = words_in_blocks.loc[ rightest_box = words_in_paragraph.loc[
words_in_blocks["left"] == words_in_blocks["left"].max() words_in_paragraph["left"] == words_in_paragraph["left"].max()
] ]
leftest_box = words_in_blocks.loc[ leftest_box = words_in_paragraph.loc[
words_in_blocks["left"] == words_in_blocks["left"].min() words_in_paragraph["left"] == words_in_paragraph["left"].min()
] ]
box_phrases["width"] = ( box_phrases["width"] = (
rightest_box.iloc[0].left rightest_box.iloc[0].left