개정판 aec5980d
build issue #655: moved Tesseract-OCR to ProgramData
DTI_PID/DTI_PID/TrainingImageListDialog.py | ||
12 | 12 |
import TrainingImageList_UI |
13 | 13 |
from TrainingEditorDialog import QTrainingEditorDialog |
14 | 14 |
15 |
runningPath = os.getcwd() |
16 |
###tesseractPath = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tessdata') |
17 |
tesseractPath = os.path.join('C:\\ProgramData\\Digital PID', 'Tesseract-OCR', 'tessdata')### |
18 |
###pytesseract.pytesseract.tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
19 |
pytesseract.pytesseract.tesseract_cmd = os.path.join('C:\\ProgramData\\Digital PID', 'Tesseract-OCR', 'tesseract.exe')### |
20 |
###tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
21 |
tesseract_cmd = os.path.join('C:\\ProgramData\\Digital PID', 'Tesseract-OCR', 'tesseract.exe')### |
22 |
###unicharset_extractor_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'unicharset_extractor.exe') |
23 |
unicharset_extractor_cmd = os.path.join('C:\\ProgramData\\Digital PID', 'Tesseract-OCR', 'unicharset_extractor.exe')### |
24 |
set_unicharset_properties_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'set_unicharset_properties.exe') |
15 |
dataPath = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID') |
16 |
tesseractPath = os.path.join(dataPath, 'Tesseract-OCR', 'tessdata') |
17 |
pytesseract.pytesseract.tesseract_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'tesseract.exe') |
18 |
tesseract_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'tesseract.exe') |
19 |
unicharset_extractor_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'unicharset_extractor.exe') |
20 |
set_unicharset_properties_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'set_unicharset_properties.exe') |
25 | 21 |
#langDataPath = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'set_unicharset_properties.exe') |
26 |
shapeclustering_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'shapeclustering.exe')
27 |
mftraining_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'mftraining.exe')
28 |
cntraining_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'cntraining.exe')
29 |
combine_tessdata_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'combine_tessdata.exe')
22 |
shapeclustering_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'shapeclustering.exe')
23 |
mftraining_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'mftraining.exe')
24 |
cntraining_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'cntraining.exe')
25 |
combine_tessdata_cmd = os.path.join(dataPath, 'Tesseract-OCR', 'combine_tessdata.exe')
30 | 26 |
31 | 27 |
32 | 28 |
--psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-~.,/!@#$%&*(){}[]<>:;+=?\\"\\' |
... | ... | |
171 | 167 |
from PIL import Image |
172 | 168 |
import math |
173 | 169 |
try: |
174 |
###os.environ['TESSDATA_PREFIX'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR') |
175 |
os.environ['TESSDATA_PREFIX'] = os.path.join('C:\ProgramData\Digital PID', 'Tesseract-OCR')### |
170 |
os.environ['TESSDATA_PREFIX'] = os.path.join(dataPath, 'Tesseract-OCR') |
176 | 171 |
appDocData = AppDocData.instance() |
177 | 172 |
project = appDocData.getCurrentProject() |
178 | 173 |
self.oCRLang = appDocData.getCurrentProject().getName() |
... | ... | |
294 | 289 |
return None |
295 | 290 |
296 | 291 |
try: |
292 |
originPath = os.getcwd() |
293 |
os.chdir(dataPath) |
297 | 294 |
# 1 |
298 | 295 |
trainingImgPath = self.get_short_path_name(trainingImgPath) |
299 |
trainingBoxPathT = self.get_short_path_name(trainingBoxPath.replace('.box', ''))
296 |
trainingBoxPathT = trainingImgPath.replace('.tif', '')
300 | 297 |
trainCmd = '\"' + tesseract_cmd + '\" ' + trainingImgPath + ' ' + trainingBoxPathT + ' nobatch box.train'# &timeout 15' |
301 | 298 |
subprocess.call(trainCmd, shell = True) |
302 | 299 |
... | ... | |
304 | 301 |
trainingBoxPathU = self.get_short_path_name(trainingBoxPath) |
305 | 302 |
unicharsetExtractorCmd = '\"' + unicharset_extractor_cmd + '\"' + ' ' + trainingBoxPathU# + ' &timeout 15' |
306 | 303 |
subprocess.call(unicharsetExtractorCmd, shell = True) |
307 |
''' |
304 |
308 | 305 |
# 3 |
309 |
inputUnicharset = os.path.join(runningPath, 'unicharset')
306 |
inputUnicharset = os.path.join(dataPath, 'unicharset')
310 | 307 |
inputUnicharset = self.get_short_path_name(inputUnicharset) |
311 | 308 |
scriptPath = ' --script_dir=//langdata-master' |
312 | 309 |
setUnicharsetPropertiesCmd = '\"' + set_unicharset_properties_cmd + '\" -U ' + inputUnicharset + ' -O ' + inputUnicharset + scriptPath# + ' &timeout 15' |
... | ... | |
319 | 316 |
fw.close() |
320 | 317 |
321 | 318 |
# 5 |
322 |
trPath = os.path.join(project.getTrainingFilePath(), self.oCRLang + '.' + self.oCRLang + 'F.exp0.tr') |
323 |
trPath = self.get_short_path_name(trPath) |
319 |
trPath = trainingImgPath.replace('.tif', '.tr') |
324 | 320 |
fontProperty = self.get_short_path_name(fontProperty) |
325 | 321 |
shapeclusteringCmd = '\"' + shapeclustering_cmd + '\" -F ' + fontProperty + ' -U ' + inputUnicharset + ' ' + trPath# + ' &timeout 15' |
326 | 322 |
subprocess.call(shapeclusteringCmd, shell = True) |
... | ... | |
335 | 331 |
336 | 332 |
self.deleteMidProcessFile() |
337 | 333 |
338 |
os.rename(os.path.join(runningPath, 'inttemp'), os.path.join(runningPath, self.oCRLang + '.inttemp'))
339 |
os.rename(os.path.join(runningPath, 'normproto'), os.path.join(runningPath, self.oCRLang + '.normproto'))
340 |
os.rename(os.path.join(runningPath, 'pffmtable'), os.path.join(runningPath, self.oCRLang + '.pffmtable'))
341 |
os.rename(os.path.join(runningPath, 'shapetable'), os.path.join(runningPath, self.oCRLang + '.shapetable'))
342 |
os.rename(os.path.join(runningPath, 'unicharset'), os.path.join(runningPath, self.oCRLang + '.unicharset'))
334 |
os.rename(os.path.join(dataPath, 'inttemp'), os.path.join(dataPath, self.oCRLang + '.inttemp'))
335 |
os.rename(os.path.join(dataPath, 'normproto'), os.path.join(dataPath, self.oCRLang + '.normproto'))
336 |
os.rename(os.path.join(dataPath, 'pffmtable'), os.path.join(dataPath, self.oCRLang + '.pffmtable'))
337 |
os.rename(os.path.join(dataPath, 'shapetable'), os.path.join(dataPath, self.oCRLang + '.shapetable'))
338 |
os.rename(os.path.join(dataPath, 'unicharset'), os.path.join(dataPath, self.oCRLang + '.unicharset'))
343 | 339 |
# 8 |
344 | 340 |
combineTessdataCmd = '\"' + combine_tessdata_cmd + '\" ' + self.oCRLang + '.' |
345 | 341 |
subprocess.call(combineTessdataCmd, shell = True) |
346 | 342 |
347 | 343 |
if os.path.isfile(os.path.join(tesseractPath, self.oCRLang + '.traineddata')): |
348 | 344 |
os.remove(os.path.join(tesseractPath, self.oCRLang + '.traineddata')) |
349 |
os.rename(os.path.join(runningPath, self.oCRLang + '.traineddata'), os.path.join(tesseractPath, self.oCRLang + '.traineddata'))
345 |
os.rename(os.path.join(dataPath, self.oCRLang + '.traineddata'), os.path.join(tesseractPath, self.oCRLang + '.traineddata'))
350 | 346 |
351 | 347 |
self.deleteMidProcessFile() |
352 |
''' |
348 |
353 | 349 |
except Exception as ex: |
354 | 350 |
print('error occured({}) in {}:{}'.format(ex, sys.exc_info()[-1].tb_frame.f_code.co_filename, sys.exc_info()[-1].tb_lineno)) |
355 | 351 |
from App import App |
... | ... | |
359 | 355 |
App.mainWnd().addMessage.emit(MessageType.Error, message) |
360 | 356 |
finally: |
361 | 357 |
self.deleteMidProcessFile() |
358 |
os.chdir(originPath) |
362 | 359 |
363 | 360 |
''' |
364 | 361 |
@brief delete Mid Process File |
... | ... | |
367 | 364 |
''' |
368 | 365 |
def deleteMidProcessFile(self): |
369 | 366 |
try: |
370 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.inttemp')):
371 |
os.remove(os.path.join(runningPath, self.oCRLang + '.inttemp'))
372 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.normproto')):
373 |
os.remove(os.path.join(runningPath, self.oCRLang + '.normproto'))
374 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.pffmtable')):
375 |
os.remove(os.path.join(runningPath, self.oCRLang + '.pffmtable'))
376 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.shapetable')):
377 |
os.remove(os.path.join(runningPath, self.oCRLang + '.shapetable'))
378 |
if os.path.isfile(os.path.join(runningPath, self.oCRLang + '.unicharset')):
379 |
os.remove(os.path.join(runningPath, self.oCRLang + '.unicharset'))
367 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.inttemp')):
368 |
os.remove(os.path.join(dataPath, self.oCRLang + '.inttemp'))
369 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.normproto')):
370 |
os.remove(os.path.join(dataPath, self.oCRLang + '.normproto'))
371 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.pffmtable')):
372 |
os.remove(os.path.join(dataPath, self.oCRLang + '.pffmtable'))
373 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.shapetable')):
374 |
os.remove(os.path.join(dataPath, self.oCRLang + '.shapetable'))
375 |
if os.path.isfile(os.path.join(dataPath, self.oCRLang + '.unicharset')):
376 |
os.remove(os.path.join(dataPath, self.oCRLang + '.unicharset'))
380 | 377 |
except Exception as ex: |
381 | 378 |
print('error occured({}) in {}:{}'.format(ex, sys.exc_info()[-1].tb_frame.f_code.co_filename, sys.exc_info()[-1].tb_lineno)) |
382 | 379 |
from App import App |
DTI_PID/DTI_PID/tesseract_ocr_module.py | ||
32 | 32 |
humkyung 2018.08.13 set tesseract executable path relative to this file's path |
33 | 33 |
''' |
34 | 34 |
#pytesseract.pytesseract.tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
35 |
pytesseract.pytesseract.tesseract_cmd = os.path.join('C:\ProgramData\Digital PID', 'Tesseract-OCR', 'tesseract.exe') |
36 |
#tesseract_cmd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR', 'tesseract.exe') |
35 |
pytesseract.pytesseract.tesseract_cmd = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID', 'Tesseract-OCR', 'tesseract.exe') |
37 | 36 |
38 | 37 |
39 | 38 |
--psm 6 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-~.,/!@#$%&*(){}[]<>:;+=?\\" |
... | ... | |
189 | 188 |
""" |
190 | 189 |
def getTextInfo(img, startPoint, angle = 0, flag = FLAG_IMAGE_TO_BOXES, conf = DEFAULT_CONF): |
191 | 190 |
try: |
192 |
###os.environ['TESSDATA_PREFIX'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Tesseract-OCR') |
193 |
os.environ['TESSDATA_PREFIX'] = os.path.join('C:\ProgramData\Digital PID', 'Tesseract-OCR')### |
191 |
os.environ['TESSDATA_PREFIX'] = os.path.join(os.getenv('ALLUSERSPROFILE'), 'Digital PID', 'Tesseract-OCR') |
194 | 192 |
textInfoList = [] |
195 | 193 |
196 | 194 |
docData = AppDocData.instance() |
내보내기 Unified diff