개정판 aec5980d
build issue #655: moved Tesseract to ProgramData
DTI_PID/DTI_PID/TrainingImageListDialog.py | ||
---|---|---|
12 | 12 |
import TrainingImageList_UI |
13 | 13 |
from TrainingEditorDialog import QTrainingEditorDialog |
14 | 14 |
|
15 |
runningPath = os.getcwd() |
|
16 |
###tesseractPath = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tessdata') |
|
17 |
tesseractPath = os.path.join('C:\\ProgramData\\Digital PID', 'Tesseract-OCR', 'tessdata')### |
|
18 |
###pytesseract.pytesseract.tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
|
19 |
pytesseract.pytesseract.tesseract_cmd = os.path.join('C:\\ProgramData\\Digital PID', 'Tesseract-OCR', 'tesseract.exe')### |
|
20 |
###tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
|
21 |
tesseract_cmd = os.path.join('C:\\ProgramData\\Digital PID', 'Tesseract-OCR', 'tesseract.exe')### |
|
22 |
###unicharset_extractor_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'unicharset_extractor.exe') |
|
23 |
unicharset_extractor_cmd = os.path.join('C:\\ProgramData\\Digital PID', 'Tesseract-OCR', 'unicharset_extractor.exe')### |
|
24 |
set_unicharset_properties_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'set_unicharset_properties.exe') |
|
15 |
dataPath = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID') |
|
16 |
tesseractPath = os.path.join(dataPath, 'Tesseract-OCR', 'tessdata') |
|
17 |
pytesseract.pytesseract.tesseract_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'tesseract.exe') |
|
18 |
tesseract_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'tesseract.exe') |
|
19 |
unicharset_extractor_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'unicharset_extractor.exe') |
|
20 |
set_unicharset_properties_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'set_unicharset_properties.exe') |
|
25 | 21 |
#langDataPath = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'set_unicharset_properties.exe') |
26 |
shapeclustering_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'shapeclustering.exe')
|
|
27 |
mftraining_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'mftraining.exe')
|
|
28 |
cntraining_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'cntraining.exe')
|
|
29 |
combine_tessdata_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'combine_tessdata.exe')
|
|
22 |
shapeclustering_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'shapeclustering.exe')
|
|
23 |
mftraining_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'mftraining.exe')
|
|
24 |
cntraining_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'cntraining.exe')
|
|
25 |
combine_tessdata_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'combine_tessdata.exe')
|
|
30 | 26 |
|
31 | 27 |
DEFAULT_CONF = """ |
32 | 28 |
--psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-~.,/!@#$%&*(){}[]<>:;+=?\\"\\' |
... | ... | |
171 | 167 |
from PIL import Image |
172 | 168 |
import math |
173 | 169 |
try: |
174 |
###os.environ['TESSDATA_PREFIX'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR') |
|
175 |
os.environ['TESSDATA_PREFIX'] = os.path.join('C:\ProgramData\Digital PID', 'Tesseract-OCR')### |
|
170 |
os.environ['TESSDATA_PREFIX'] = os.path.join(dataPath, 'Tesseract-OCR') |
|
176 | 171 |
appDocData = AppDocData.instance() |
177 | 172 |
project = appDocData.getCurrentProject() |
178 | 173 |
self.oCRLang = appDocData.getCurrentProject().getName() |
... | ... | |
294 | 289 |
return None |
295 | 290 |
|
296 | 291 |
try: |
292 |
originPath = os.getcwd() |
|
293 |
os.chdir(dataPath) |
|
297 | 294 |
# 1 |
298 | 295 |
trainingImgPath = self.get_short_path_name(trainingImgPath) |
299 |
trainingBoxPathT = self.get_short_path_name(trainingBoxPath.replace('.box', ''))
|
|
296 |
trainingBoxPathT = trainingImgPath.replace('.tif', '')
|
|
300 | 297 |
trainCmd = '\"' + tesseract_cmd + '\" ' + trainingImgPath + ' ' + trainingBoxPathT + ' nobatch box.train'# &timeout 15' |
301 | 298 |
subprocess.call(trainCmd, shell = True) |
302 | 299 |
|
... | ... | |
304 | 301 |
trainingBoxPathU = self.get_short_path_name(trainingBoxPath) |
305 | 302 |
unicharsetExtractorCmd = '\"' + unicharset_extractor_cmd + '\"' + ' ' + trainingBoxPathU# + ' &timeout 15' |
306 | 303 |
subprocess.call(unicharsetExtractorCmd, shell = True) |
307 |
''' |
|
304 |
|
|
308 | 305 |
# 3 |
309 |
inputUnicharset = os.path.join(runningPath, 'unicharset')
|
|
306 |
inputUnicharset = os.path.join(dataPath, 'unicharset')
|
|
310 | 307 |
inputUnicharset = self.get_short_path_name(inputUnicharset) |
311 | 308 |
scriptPath = ' --script_dir=//langdata-master' |
312 | 309 |
setUnicharsetPropertiesCmd = '\"' + set_unicharset_properties_cmd + '\" -U ' + inputUnicharset + ' -O ' + inputUnicharset + scriptPath# + ' &timeout 15' |
... | ... | |
319 | 316 |
fw.close() |
320 | 317 |
|
321 | 318 |
# 5 |
322 |
trPath = os.path.join(project.getTrainingFilePath(), self.oCRLang + '.' + self.oCRLang + 'F.exp0.tr') |
|
323 |
trPath = self.get_short_path_name(trPath) |
|
319 |
trPath = trainingImgPath.replace('.tif', '.tr') |
|
324 | 320 |
fontProperty = self.get_short_path_name(fontProperty) |
325 | 321 |
shapeclusteringCmd = '\"' + shapeclustering_cmd + '\" -F ' + fontProperty + ' -U ' + inputUnicharset + ' ' + trPath# + ' &timeout 15' |
326 | 322 |
subprocess.call(shapeclusteringCmd, shell = True) |
... | ... | |
335 | 331 |
|
336 | 332 |
self.deleteMidProcessFile() |
337 | 333 |
|
338 |
os.rename(os.path.join(runningPath, 'inttemp'), os.path.join(runningPath, self.oCRLang + '.inttemp'))
|
|
339 |
os.rename(os.path.join(runningPath, 'normproto'), os.path.join(runningPath, self.oCRLang + '.normproto'))
|
|
340 |
os.rename(os.path.join(runningPath, 'pffmtable'), os.path.join(runningPath, self.oCRLang + '.pffmtable'))
|
|
341 |
os.rename(os.path.join(runningPath, 'shapetable'), os.path.join(runningPath, self.oCRLang + '.shapetable'))
|
|
342 |
os.rename(os.path.join(runningPath, 'unicharset'), os.path.join(runningPath, self.oCRLang + '.unicharset'))
|
|
334 |
os.rename(os.path.join(dataPath, 'inttemp'), os.path.join(dataPath, self.oCRLang + '.inttemp'))
|
|
335 |
os.rename(os.path.join(dataPath, 'normproto'), os.path.join(dataPath, self.oCRLang + '.normproto'))
|
|
336 |
os.rename(os.path.join(dataPath, 'pffmtable'), os.path.join(dataPath, self.oCRLang + '.pffmtable'))
|
|
337 |
os.rename(os.path.join(dataPath, 'shapetable'), os.path.join(dataPath, self.oCRLang + '.shapetable'))
|
|
338 |
os.rename(os.path.join(dataPath, 'unicharset'), os.path.join(dataPath, self.oCRLang + '.unicharset'))
|
|
343 | 339 |
# 8 |
344 | 340 |
combineTessdataCmd = '\"' + combine_tessdata_cmd + '\" ' + self.oCRLang + '.' |
345 | 341 |
subprocess.call(combineTessdataCmd, shell = True) |
346 | 342 |
|
347 | 343 |
if os.path.isfile(os.path.join(tesseractPath, self.oCRLang + '.traineddata')): |
348 | 344 |
os.remove(os.path.join(tesseractPath, self.oCRLang + '.traineddata')) |
349 |
os.rename(os.path.join(runningPath, self.oCRLang + '.traineddata'), os.path.join(tesseractPath, self.oCRLang + '.traineddata'))
|
|
345 |
os.rename(os.path.join(dataPath, self.oCRLang + '.traineddata'), os.path.join(tesseractPath, self.oCRLang + '.traineddata'))
|
|
350 | 346 |
|
351 | 347 |
self.deleteMidProcessFile() |
352 |
''' |
|
348 |
|
|
353 | 349 |
except Exception as ex: |
354 | 350 |
print('error occured({}) in {}:{}'.format(ex, sys.exc_info()[-1].tb_frame.f_code.co_filename, sys.exc_info()[-1].tb_lineno)) |
355 | 351 |
from App import App |
... | ... | |
359 | 355 |
App.mainWnd().addMessage.emit(MessageType.Error, message) |
360 | 356 |
finally: |
361 | 357 |
self.deleteMidProcessFile() |
358 |
os.chdir(originPath) |
|
362 | 359 |
|
363 | 360 |
''' |
364 | 361 |
@brief delete Mid Process File |
... | ... | |
367 | 364 |
''' |
368 | 365 |
def deleteMidProcessFile(self): |
369 | 366 |
try: |
370 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.inttemp')):
|
|
371 |
os.remove(os.path.join(runningPath, self.oCRLang + '.inttemp'))
|
|
372 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.normproto')):
|
|
373 |
os.remove(os.path.join(runningPath, self.oCRLang + '.normproto'))
|
|
374 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.pffmtable')):
|
|
375 |
os.remove(os.path.join(runningPath, self.oCRLang + '.pffmtable'))
|
|
376 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.shapetable')):
|
|
377 |
os.remove(os.path.join(runningPath, self.oCRLang + '.shapetable'))
|
|
378 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.unicharset')):
|
|
379 |
os.remove(os.path.join(runningPath, self.oCRLang + '.unicharset'))
|
|
367 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.inttemp')):
|
|
368 |
os.remove(os.path.join(dataPath, self.oCRLang + '.inttemp'))
|
|
369 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.normproto')):
|
|
370 |
os.remove(os.path.join(dataPath, self.oCRLang + '.normproto'))
|
|
371 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.pffmtable')):
|
|
372 |
os.remove(os.path.join(dataPath, self.oCRLang + '.pffmtable'))
|
|
373 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.shapetable')):
|
|
374 |
os.remove(os.path.join(dataPath, self.oCRLang + '.shapetable'))
|
|
375 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.unicharset')):
|
|
376 |
os.remove(os.path.join(dataPath, self.oCRLang + '.unicharset'))
|
|
380 | 377 |
except Exception as ex: |
381 | 378 |
print('error occured({}) in {}:{}'.format(ex, sys.exc_info()[-1].tb_frame.f_code.co_filename, sys.exc_info()[-1].tb_lineno)) |
382 | 379 |
from App import App |
DTI_PID/DTI_PID/tesseract_ocr_module.py | ||
---|---|---|
32 | 32 |
humkyung 2018.08.13 set tesseract executable path relative to this file's path |
33 | 33 |
''' |
34 | 34 |
#pytesseract.pytesseract.tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
35 |
pytesseract.pytesseract.tesseract_cmd = os.path.join('C:\ProgramData\Digital PID', 'Tesseract-OCR', 'tesseract.exe') |
|
36 |
#tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
|
35 |
pytesseract.pytesseract.tesseract_cmd = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID', 'Tesseract-OCR', 'tesseract.exe') |
|
37 | 36 |
|
38 | 37 |
DEFAULT_CONF = """ |
39 | 38 |
--psm 6 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-~.,/!@#$%&*(){}[]<>:;+=?\\" |
... | ... | |
189 | 188 |
""" |
190 | 189 |
def getTextInfo(img, startPoint, angle = 0, flag = FLAG_IMAGE_TO_BOXES, conf = DEFAULT_CONF): |
191 | 190 |
try: |
192 |
###os.environ['TESSDATA_PREFIX'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR') |
|
193 |
os.environ['TESSDATA_PREFIX'] = os.path.join('C:\ProgramData\Digital PID', 'Tesseract-OCR')### |
|
191 |
os.environ['TESSDATA_PREFIX'] = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID', 'Tesseract-OCR') |
|
194 | 192 |
textInfoList = [] |
195 | 193 |
|
196 | 194 |
docData = AppDocData.instance() |
내보내기 Unified diff