You've already forked MineEVECraft
mirror of
https://github.com/Llloooggg/MineEVECraft.git
synced 2026-03-06 03:36:24 +03:00
При объединении слов учитываются параграфы
This commit is contained in:
63
main.py
63
main.py
@@ -1,5 +1,4 @@
|
|||||||
import time
|
import time
|
||||||
import random
|
|
||||||
import math
|
import math
|
||||||
import logging
|
import logging
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -90,11 +89,15 @@ def get_screenshot():
|
|||||||
|
|
||||||
|
|
||||||
def get_boxes(screenshot):
|
def get_boxes(screenshot):
|
||||||
|
inverted_screenshot = cv2.bitwise_not(
|
||||||
|
cv2.cvtColor(screenshot, cv2.COLOR_BGR2GRAY)
|
||||||
|
)
|
||||||
|
|
||||||
raw_boxes = pytesseract.image_to_data(
|
raw_boxes = pytesseract.image_to_data(
|
||||||
screenshot,
|
inverted_screenshot,
|
||||||
lang="eng",
|
lang="eng",
|
||||||
output_type=Output.DATAFRAME,
|
output_type=Output.DATAFRAME,
|
||||||
config="--psm 3 -c preserve_interword_spaces=1",
|
config="--psm 3",
|
||||||
)
|
)
|
||||||
|
|
||||||
if save_result:
|
if save_result:
|
||||||
@@ -123,35 +126,45 @@ def union_boxes(base_boxes):
|
|||||||
"text",
|
"text",
|
||||||
"block_num",
|
"block_num",
|
||||||
"line_num",
|
"line_num",
|
||||||
|
"par_num",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
for box in base_boxes["block_num"].unique():
|
for box in base_boxes["block_num"].unique():
|
||||||
words_in_blocks = base_boxes.loc[base_boxes["block_num"] == box]
|
paragraphs_in_box = base_boxes.loc[base_boxes["block_num"] == box][
|
||||||
|
"par_num"
|
||||||
|
].unique()
|
||||||
|
for paragraph in paragraphs_in_box:
|
||||||
|
words_in_paragraph = base_boxes.loc[
|
||||||
|
(base_boxes["block_num"] == box)
|
||||||
|
& (base_boxes["par_num"] == paragraph),
|
||||||
|
]
|
||||||
|
|
||||||
grouped_words = words_in_blocks.groupby("line_num", as_index=False)
|
grouped_words = words_in_paragraph.groupby(
|
||||||
|
"line_num", as_index=False
|
||||||
|
)
|
||||||
|
|
||||||
box_phrases = grouped_words["width"].sum()
|
box_phrases = grouped_words["width"].sum()
|
||||||
box_phrases = box_phrases.merge(
|
box_phrases = box_phrases.merge(
|
||||||
grouped_words["height"].max(), on="line_num", how="left"
|
grouped_words["height"].max(), on="line_num", how="left"
|
||||||
)
|
)
|
||||||
box_phrases = box_phrases.merge(
|
box_phrases = box_phrases.merge(
|
||||||
grouped_words["left"].min(), on="line_num", how="left"
|
grouped_words["left"].min(), on="line_num", how="left"
|
||||||
)
|
)
|
||||||
box_phrases = box_phrases.merge(
|
box_phrases = box_phrases.merge(
|
||||||
grouped_words["top"].min(), on="line_num", how="left"
|
grouped_words["top"].min(), on="line_num", how="left"
|
||||||
)
|
)
|
||||||
box_phrases = box_phrases.merge(
|
box_phrases = box_phrases.merge(
|
||||||
grouped_words["text"].apply(" ".join),
|
grouped_words["text"].apply(" ".join),
|
||||||
on="line_num",
|
on="line_num",
|
||||||
how="left",
|
how="left",
|
||||||
)
|
)
|
||||||
box_phrases["block_num"] = box
|
box_phrases["block_num"] = box
|
||||||
|
|
||||||
rightest_box = words_in_blocks.loc[
|
rightest_box = words_in_paragraph.loc[
|
||||||
words_in_blocks["left"] == words_in_blocks["left"].max()
|
words_in_paragraph["left"] == words_in_paragraph["left"].max()
|
||||||
]
|
]
|
||||||
leftest_box = words_in_blocks.loc[
|
leftest_box = words_in_paragraph.loc[
|
||||||
words_in_blocks["left"] == words_in_blocks["left"].min()
|
words_in_paragraph["left"] == words_in_paragraph["left"].min()
|
||||||
]
|
]
|
||||||
box_phrases["width"] = (
|
box_phrases["width"] = (
|
||||||
rightest_box.iloc[0].left
|
rightest_box.iloc[0].left
|
||||||
|
|||||||
Reference in New Issue
Block a user