개정판 e92b19bf
issue #655: white char
DTI_PID/DTI_PID/ConfigurationDialog.py | ||
---|---|---|
13 | 13 |
from AppDocData import Config |
14 | 14 |
from AppDocData import Color |
15 | 15 |
import Configuration_UI |
16 |
import tesseract_ocr_module as TOCR |
|
16 | 17 |
|
17 | 18 |
class ListView(QListView): |
18 | 19 |
def __init__(self, *args, **kwargs): |
... | ... | |
60 | 61 |
self.ui.spinBoxShrinkSize.setValue(int(configs[0].value)) if 1 == len(configs) else self.ui.spinBoxShrinkSize.setValue(0) |
61 | 62 |
configs = docData.getConfigs('Text Recognition', 'Merge Size') |
62 | 63 |
self.ui.spinBoxMergeSize.setValue(int(configs[0].value)) if 1 == len(configs) else self.ui.spinBoxMergeSize.setValue(10) |
64 |
configs = docData.getConfigs('Text Recognition', 'White Character List') |
|
65 |
self.ui.lineEditWhiteCharList.setText(configs[0].value) if 1 == len(configs) else self.ui.lineEditWhiteCharList.setText(TOCR.DEFAULT_CONF[40:]) |
|
63 | 66 |
|
64 | 67 |
configs = docData.getConfigs('Text Size', 'Min Text Size') |
65 | 68 |
self.ui.minTextSizeSpinBox.setValue(int(configs[0].value)) if 1 == len(configs) else self.ui.minTextSizeSpinBox.setValue(30) |
... | ... | |
473 | 476 |
configs.append(Config('Note No Tag Rule', 'Note No Expression', self.ui.lineEditNoteNoExpression.text())) |
474 | 477 |
configs.append(Config('OPC Tag No Rule', 'Description', self.ui.lineEditDescription.text())) |
475 | 478 |
configs.append(Config('OPC Tag No Rule', 'OPC Tag', self.ui.lineEditOPCTag.text())) |
479 |
configs.append(Config('Text Recognition', 'White Character List', self.ui.lineEditWhiteCharList.text())) |
|
476 | 480 |
|
477 | 481 |
# Add Line Color Option - 2018.07.06 by kyouho |
478 | 482 |
rbRandomValue = self.ui.radioButtonRandom.isChecked() |
DTI_PID/DTI_PID/MainWindow.py | ||
---|---|---|
257 | 257 |
files = appDocData.getDrawingFileList() |
258 | 258 |
for file in files: |
259 | 259 |
drawing = [drawing for drawing in drawings if drawing[1] == file] |
260 |
if not drawing[0]: |
|
260 |
if 0 == len(drawing) or not drawing[0]:
|
|
261 | 261 |
drawings.append([None, file, None]) |
262 | 262 |
|
263 |
item = QTreeWidgetItem(self.treeWidgetDrawingList.root, [file, drawing[0][2] if drawing[0] else '']) |
|
263 |
item = QTreeWidgetItem(self.treeWidgetDrawingList.root, [file, drawing[0][2] if drawing and drawing[0] else ''])
|
|
264 | 264 |
item.setIcon(0, QIcon(':newPrefix/image.png')) |
265 | 265 |
|
266 | 266 |
self.treeWidgetDrawingList.root.setText(0, 'P&ID Drawings({})'.format(self.treeWidgetDrawingList.root.childCount())) |
DTI_PID/DTI_PID/OcrResultDialog.py | ||
---|---|---|
85 | 85 |
self.graphicsView.setImage(self.image) |
86 | 86 |
|
87 | 87 |
''' |
88 |
@history 2018.04.26 Jeongwoo Add Rectangle with modified Coords |
|
89 |
2018.06.20 Jeongwoo Remove test code |
|
88 |
@history 2018.04.26 Jeongwoo Add Rectangle with modified Coords |
|
89 |
2018.06.20 Jeongwoo Remove test code |
|
90 |
2018.11.08 euisung add check for a white char list stored in the db |
|
90 | 91 |
''' |
91 | 92 |
def detectText(self): |
92 | 93 |
try: |
... | ... | |
136 | 137 |
#cv2.destroyAllWindows() |
137 | 138 |
|
138 | 139 |
# up to here |
139 |
self.textInfoList = TOCR.getTextInfo(img, (round(self.boundingBox.x()), round(self.boundingBox.y()))) |
|
140 |
docData = AppDocData.instance() |
|
141 |
whiteCharList = docData.getConfigs('Text Recognition', 'White Character List') |
|
142 |
if len(whiteCharList) is 0: |
|
143 |
self.textInfoList = TOCR.getTextInfo(img, (round(self.boundingBox.x()), round(self.boundingBox.y()))) |
|
144 |
else: |
|
145 |
self.textInfoList = TOCR.getTextInfo(img, (round(self.boundingBox.x()), round(self.boundingBox.y())), conf = whiteCharList[0].value) |
|
140 | 146 |
|
141 | 147 |
#self.textInfoList = TOCR.getTextInfo(img, (int(self.boundingBox.x()), int(self.boundingBox.y()))) |
142 | 148 |
if self.textInfoList is not None and len(self.textInfoList) > 0: |
DTI_PID/DTI_PID/TextDetector.py | ||
---|---|---|
163 | 163 |
@author humkyung |
164 | 164 |
@date 2018.07.24 |
165 | 165 |
@history change parameter updateProgressSignal to worker |
166 |
2018.11.08 euisung add check for a white char list stored in the db |
|
166 | 167 |
''' |
167 | 168 |
@staticmethod |
168 | 169 |
def recognizeTextFromImage(tInfo, imgOCR, offset, searchedSymbolList, worker, listWidget, maxProgressValue): |
... | ... | |
185 | 186 |
if 'Instrumentation' == category: tInfo.setAngle(0) |
186 | 187 |
# up to here |
187 | 188 |
|
188 |
resultTextInfo = TOCR.getTextInfo(img, (x, y), tInfo.getAngle()) |
|
189 |
whiteCharList = appDocData.getConfigs('Text Recognition', 'White Character List') |
|
190 |
if len(whiteCharList) is 0: |
|
191 |
resultTextInfo = TOCR.getTextInfo(img, (x, y), tInfo.getAngle()) |
|
192 |
else: |
|
193 |
resultTextInfo = TOCR.getTextInfo(img, (x, y), tInfo.getAngle(), conf = whiteCharList[0].value) |
|
194 |
|
|
189 | 195 |
if resultTextInfo is not None and len(resultTextInfo) > 0: |
190 | 196 |
for result in resultTextInfo: |
191 | 197 |
result.setX(result.getX() + round(offset[0])) |
... | ... | |
243 | 249 |
if onlyTextArea: |
244 | 250 |
return |
245 | 251 |
# parse texts in area except Drawing area |
252 |
whiteCharList = appDocData.getConfigs('Text Recognition', 'White Character List') |
|
246 | 253 |
for area in appDocData.getAreaList(): |
247 | 254 |
if area.name == 'Drawing': continue |
248 | 255 |
|
249 | 256 |
if area.name == 'Unit': |
250 | 257 |
img = imgSrc[round(area.y):round(area.y+area.height), round(area.x):round(area.x+area.width)] |
251 |
texts = TOCR.getTextInfo(img, (area.x, area.y), 0) |
|
258 |
if len(whiteCharList) is 0: |
|
259 |
texts = TOCR.getTextInfo(img, (area.x, area.y), 0) |
|
260 |
else: |
|
261 |
texts = TOCR.getTextInfo(img, (area.x, area.y), 0, conf = whiteCharList[0].value) |
|
252 | 262 |
if texts is not None and len(texts) > 0: |
253 | 263 |
appDocData.activeDrawing.setAttr('Unit', texts[0].getText()) |
254 | 264 |
self.otherTextInfoList.append([area.name, texts]) |
255 | 265 |
else: |
256 | 266 |
if area is not None and hasattr(area, 'img') and area.img is not None: |
257 |
texts = TOCR.getTextInfo(area.img, (area.x, area.y)) |
|
267 |
if len(whiteCharList) is 0: |
|
268 |
texts = TOCR.getTextInfo(area.img, (area.x, area.y)) |
|
269 |
else: |
|
270 |
texts = TOCR.getTextInfo(area.img, (area.x, area.y), conf = whiteCharList[0].value) |
|
258 | 271 |
self.otherTextInfoList.append([area.name, texts]) |
259 | 272 |
|
260 | 273 |
if worker is not None: worker.updateProgress.emit(maxProgressValue, None) |
... | ... | |
292 | 305 |
''' |
293 | 306 |
def recognizeTextInArea(self, imgSrc, area, angle=0): |
294 | 307 |
try: |
308 |
appDocData = AppDocData.instance() |
|
309 |
whiteCharList = appDocData.getConfigs('Text Recognition', 'White Character List') |
|
295 | 310 |
img = imgSrc[round(area[1]):round(area[1] + area[3]), round(area[0]):round(area[0] + area[2])] |
296 |
return TOCR.getTextInfo(img, (area[0], area[1])) |
|
311 |
if len(whiteCharList) is 0: |
|
312 |
return TOCR.getTextInfo(img, (area[0], area[1])) |
|
313 |
else: |
|
314 |
return TOCR.getTextInfo(img, (area[0], area[1]), conf = whiteCharList[0].value) |
|
297 | 315 |
except Exception as ex: |
298 | 316 |
print('error occured({}) in {}:{}'.format(ex, sys.exc_info()[-1].tb_frame.f_code.co_filename, sys.exc_info()[-1].tb_lineno)) |
299 | 317 |
|
DTI_PID/DTI_PID/TrainingImageListDialog.py | ||
---|---|---|
11 | 11 |
import pytesseract |
12 | 12 |
import TrainingImageList_UI |
13 | 13 |
from TrainingEditorDialog import QTrainingEditorDialog |
14 |
import tesseract_ocr_module as TOCR |
|
14 | 15 |
|
15 | 16 |
dataPath = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID') |
16 | 17 |
tesseractPath = os.path.join(dataPath, 'Tesseract-OCR', 'tessdata') |
... | ... | |
24 | 25 |
cntraining_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'cntraining.exe') |
25 | 26 |
combine_tessdata_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'combine_tessdata.exe') |
26 | 27 |
|
27 |
DEFAULT_CONF = """ |
|
28 |
--psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-~.,/!@#$%&*(){}[]<>:;+=?\\"\\' |
|
29 |
""" |
|
30 |
|
|
31 | 28 |
class QTrainingImageListDialog(QDialog): |
32 | 29 |
trainingDataNumber = 0 |
33 | 30 |
|
... | ... | |
438 | 435 |
trainingBoxPath = os.path.join(project.getTrainingFilePath(), boxName) |
439 | 436 |
boundaryOcrData = None |
440 | 437 |
if not isBoxFile: |
441 |
boundaryOcrData = pytesseract.image_to_boxes(drawing, config=DEFAULT_CONF, lang='seed+eng') |
|
438 |
docData = AppDocData.instance() |
|
439 |
oCRLang = docData.getCurrentProject().getName() if TOCR.isTrainedData() else 'eng' |
|
440 |
whiteCharList = docData.getConfigs('Text Recognition', 'White Character List') |
|
441 |
if len(whiteCharList) is 0: |
|
442 |
boundaryOcrData = pytesseract.image_to_boxes(drawing, config=TOCR.DEFAULT_CONF, lang=oCRLang) |
|
443 |
else: |
|
444 |
boundaryOcrData = pytesseract.image_to_boxes(drawing, config=TOCR.DEFAULT_CONF[:40] + whiteCharList[0].value, lang=oCRLang) |
|
442 | 445 |
|
443 | 446 |
except Exception as ex: |
444 | 447 |
print('error occured({}) in {}:{}'.format(ex, sys.exc_info()[-1].tb_frame.f_code.co_filename, sys.exc_info()[-1].tb_lineno)) |
DTI_PID/DTI_PID/tesseract_ocr_module.py | ||
---|---|---|
29 | 29 |
## Tesseract path |
30 | 30 |
''' |
31 | 31 |
@history Jeongwoo 2018.06.14 Tesseract path changed |
32 |
humkyung 2018.08.13 set tesseract executable path to relative of this file path |
|
32 |
humkyung 2018.08.13 set tesseract executable path to relative of this file path |
|
33 |
euisung set tesseract executable path to ProgramData |
|
33 | 34 |
''' |
34 | 35 |
#pytesseract.pytesseract.tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
35 | 36 |
pytesseract.pytesseract.tesseract_cmd = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID', 'Tesseract-OCR', 'tesseract.exe') |
36 | 37 |
|
38 |
tesseract_path = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID', 'Tesseract-OCR') |
|
37 | 39 |
DEFAULT_CONF = """ |
38 | 40 |
--psm 6 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-~.,/!@#$%&*(){}[]<>:;+=?\\" |
39 | 41 |
""" |
40 | 42 |
|
43 |
def isTrainedData(): |
|
44 |
''' |
|
45 |
@brief check whether the trained data file for the current project exists |
|
46 |
@author euisung |
|
47 |
@date 2018.11.08 |
|
48 |
''' |
|
49 |
docData = AppDocData.instance() |
|
50 |
prj_trained_data = os.path.join(tesseract_path, 'tessdata', docData.getCurrentProject().getName()+'.traineddata') |
|
51 |
if os.path.isfile(prj_trained_data): |
|
52 |
return True |
|
53 |
else: |
|
54 |
return False |
|
55 |
|
|
41 | 56 |
""" |
42 | 57 |
@history 2018.04.26 Jeongwoo Make TextInfo object with Calculated Coords (with BoundBox Coords) |
43 | 58 |
2018.04.30 Jeongwoo Add QRect.setHeight() in if-statement [(lineRect is not None and currentRect is not None) and lineRect.intersects(currentRect)] |
... | ... | |
49 | 64 |
2018.06.20 Jeongwoo Remove variable [lastCharHeight] / Change variable [cey], [ch] / Change method to calculate text line height |
50 | 65 |
humkyung 2018.10.12 change logic to extract text which first get bounding box and then extract character |
51 | 66 |
2018.10.19 euisung OCR lang change depend on project name |
52 |
2018.10.22 euisung system environment variable 'TESSDATA_PREFIX' change depend on project path |
|
67 |
2018.10.22 euisung system environment variable 'TESSDATA_PREFIX' change to ProgramData |
|
68 |
2018.11.08 euisung add config for OCR white char list |
|
53 | 69 |
""" |
54 |
def getTextInfo(img, startPoint, angle = 0, flag = FLAG_IMAGE_TO_BOXES, conf = DEFAULT_CONF):
|
|
70 |
def getTextInfo(img, startPoint, angle = 0, flag = FLAG_IMAGE_TO_BOXES, conf = None):
|
|
55 | 71 |
try: |
56 |
tesseract_path = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID', 'Tesseract-OCR') |
|
57 | 72 |
os.environ['TESSDATA_PREFIX'] = tesseract_path |
58 | 73 |
textInfoList = [] |
59 | 74 |
|
75 |
if conf == None: |
|
76 |
conf = DEFAULT_CONF |
|
77 |
else: |
|
78 |
conf = DEFAULT_CONF[:40] + conf |
|
79 |
|
|
60 | 80 |
docData = AppDocData.instance() |
61 |
prj_trained_data = os.path.join(tesseract_path, 'tessdata', docData.getCurrentProject().getName()+'.traineddata') |
|
62 |
oCRLang = docData.getCurrentProject().getName() if os.path.isfile(prj_trained_data) else 'eng' |
|
63 |
#oCRLang = 'eng' |
|
81 |
oCRLang = docData.getCurrentProject().getName() if isTrainedData() else 'eng' |
|
82 |
|
|
64 | 83 |
configs = docData.getConfigs('Text Size', 'Min Text Size') |
65 | 84 |
minSize = int(configs[0].value) if 1 == len(configs) else 30 |
66 | 85 |
configs = docData.getConfigs('Text Size', 'Max Text Size') |
내보내기 Unified diff