# Makefile
  1. export
  2. SHELL := /bin/bash
  3. LOCAL := $(PWD)/usr
  4. PATH := $(LOCAL)/bin:$(PATH)
  5. TESSDATA = $(LOCAL)/share/tessdata
  6. # Name of the model to be built. Default: $(MODEL_NAME)
  7. MODEL_NAME = foo
  8. # Name of the model to continue from. Default: '$(START_MODEL)'
  9. START_MODEL =
  10. LAST_CHECKPOINT = data/checkpoints/$(MODEL_NAME)_checkpoint
  11. # Name of the proto model. Default: '$(PROTO_MODEL)'
  12. PROTO_MODEL = data/$(MODEL_NAME)/$(MODEL_NAME).traineddata
  13. # No of cores to use for compiling leptonica/tesseract. Default: $(CORES)
  14. CORES = 4
  15. # Leptonica version. Default: $(LEPTONICA_VERSION)
  16. LEPTONICA_VERSION := 1.75.3
  17. # Tesseract commit. Default: $(TESSERACT_VERSION)
  18. TESSERACT_VERSION := fd492062d08a2f55001a639f2015b8524c7e9ad4
  19. # Tesseract model repo to use. Default: $(TESSDATA_REPO)
  20. TESSDATA_REPO = _best
  21. # Ground truth directory. Default: $(GROUND_TRUTH_DIR)
  22. GROUND_TRUTH_DIR := data/ground-truth
  23. # Normalization Mode - see src/training/language_specific.sh for details. Default: $(NORM_MODE)
  24. NORM_MODE = 2
  25. # Page segmentation mode. Default: $(PSM)
  26. PSM = 6
  27. # Ratio of train / eval training data. Default: $(RATIO_TRAIN)
  28. RATIO_TRAIN := 0.90
  29. # BEGIN-EVAL makefile-parser --make-help Makefile
  30. help:
  31. @echo ""
  32. @echo " Targets"
  33. @echo ""
  34. @echo " unicharset Create unicharset"
  35. @echo " lists Create lists of lstmf filenames for training and eval"
  36. @echo " training Start training"
  37. @echo " proto-model Build the proto model"
  38. @echo " leptonica Build leptonica"
  39. @echo " tesseract Build tesseract"
  40. @echo " tesseract-langs Download tesseract-langs"
  41. @echo " clean Clean all generated files"
  42. @echo ""
  43. @echo " Variables"
  44. @echo ""
  45. @echo " MODEL_NAME Name of the model to be built. Default: $(MODEL_NAME)"
  46. @echo " START_MODEL Name of the model to continue from. Default: '$(START_MODEL)'"
  47. @echo " PROTO_MODEL Name of the proto model. Default: '$(PROTO_MODEL)'"
  48. @echo " CORES No of cores to use for compiling leptonica/tesseract. Default: $(CORES)"
  49. @echo " LEPTONICA_VERSION Leptonica version. Default: $(LEPTONICA_VERSION)"
  50. @echo " TESSERACT_VERSION Tesseract commit. Default: $(TESSERACT_VERSION)"
  51. @echo " TESSDATA_REPO Tesseract model repo to use. Default: $(TESSDATA_REPO)"
  52. @echo " GROUND_TRUTH_DIR Ground truth directory. Default: $(GROUND_TRUTH_DIR)"
  53. @echo " NORM_MODE Normalization Mode - see src/training/language_specific.sh for details. Default: $(NORM_MODE)"
  54. @echo " PSM Page segmentation mode. Default: $(PSM)"
  55. @echo " RATIO_TRAIN Ratio of train / eval training data. Default: $(RATIO_TRAIN)"
  56. # END-EVAL
  57. ALL_BOXES = data/all-boxes
  58. ALL_LSTMF = data/all-lstmf
  59. # Create unicharset
  60. unicharset: data/unicharset
  61. # Create lists of lstmf filenames for training and eval
  62. lists: $(ALL_LSTMF) data/list.train data/list.eval
  63. data/list.train: $(ALL_LSTMF)
  64. total=`cat $(ALL_LSTMF) | wc -l` \
  65. no=`echo "$$total * $(RATIO_TRAIN) / 1" | bc`; \
  66. head -n "$$no" $(ALL_LSTMF) > "$@"
  67. data/list.eval: $(ALL_LSTMF)
  68. total=`cat $(ALL_LSTMF) | wc -l` \
  69. no=`echo "($$total - $$total * $(RATIO_TRAIN)) / 1" | bc`; \
  70. tail -n "$$no" $(ALL_LSTMF) > "$@"
  71. # Start training
  72. training: data/$(MODEL_NAME).traineddata
  73. ifdef START_MODEL
  74. data/unicharset: $(ALL_BOXES)
  75. mkdir -p data/$(START_MODEL)
  76. combine_tessdata -u $(TESSDATA)/$(START_MODEL).traineddata data/$(START_MODEL)/$(START_MODEL)
  77. unicharset_extractor --output_unicharset "$(GROUND_TRUTH_DIR)/my.unicharset" --norm_mode $(NORM_MODE) "$(ALL_BOXES)"
  78. merge_unicharsets data/$(START_MODEL)/$(START_MODEL).lstm-unicharset $(GROUND_TRUTH_DIR)/my.unicharset "$@"
  79. else
  80. data/unicharset: $(ALL_BOXES)
  81. unicharset_extractor --output_unicharset "$@" --norm_mode 1 "$(ALL_BOXES)"
  82. endif
  83. $(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(GROUND_TRUTH_DIR)/*.tif)))
  84. find $(GROUND_TRUTH_DIR) -name '*.box' -exec cat {} \; > "$@"
  85. $(GROUND_TRUTH_DIR)/%.box: $(GROUND_TRUTH_DIR)/%.tif $(GROUND_TRUTH_DIR)/%.gt.txt
  86. python generate_line_box.py -i "$(GROUND_TRUTH_DIR)/$*.tif" -t "$(GROUND_TRUTH_DIR)/$*.gt.txt" > "$@"
  87. $(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(GROUND_TRUTH_DIR)/*.tif)))
  88. find $(GROUND_TRUTH_DIR) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@"
  89. $(GROUND_TRUTH_DIR)/%.lstmf: $(GROUND_TRUTH_DIR)/%.box
  90. tesseract $(GROUND_TRUTH_DIR)/$*.tif $(GROUND_TRUTH_DIR)/$* --psm $(PSM) lstm.train
  91. # Build the proto model
  92. proto-model: $(PROTO_MODEL)
  93. $(PROTO_MODEL): data/unicharset data/radical-stroke.txt
  94. combine_lang_model \
  95. --input_unicharset data/unicharset \
  96. --script_dir data/ \
  97. --output_dir data/ \
  98. --lang $(MODEL_NAME)
  99. ifdef START_MODEL
  100. $(LAST_CHECKPOINT): unicharset lists $(PROTO_MODEL)
  101. mkdir -p data/checkpoints
  102. lstmtraining \
  103. --traineddata $(PROTO_MODEL) \
  104. --old_traineddata $(TESSDATA)/$(START_MODEL).traineddata \
  105. --continue_from data/$(START_MODEL)/$(START_MODEL).lstm \
  106. --net_spec "[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c`head -n1 data/unicharset`]" \
  107. --model_output data/checkpoints/$(MODEL_NAME) \
  108. --learning_rate 20e-4 \
  109. --train_listfile data/list.train \
  110. --eval_listfile data/list.eval \
  111. --max_iterations 10000
  112. else
  113. $(LAST_CHECKPOINT): unicharset lists $(PROTO_MODEL)
  114. mkdir -p data/checkpoints
  115. lstmtraining \
  116. --traineddata $(PROTO_MODEL) \
  117. --net_spec "[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c`head -n1 data/unicharset`]" \
  118. --model_output data/checkpoints/$(MODEL_NAME) \
  119. --learning_rate 20e-4 \
  120. --train_listfile data/list.train \
  121. --eval_listfile data/list.eval \
  122. --max_iterations 10000
  123. endif
  124. data/$(MODEL_NAME).traineddata: $(LAST_CHECKPOINT)
  125. lstmtraining \
  126. --stop_training \
  127. --continue_from $(LAST_CHECKPOINT) \
  128. --traineddata $(PROTO_MODEL) \
  129. --model_output $@
  130. data/radical-stroke.txt:
  131. wget -O$@ 'https://github.com/tesseract-ocr/langdata_lstm/raw/master/radical-stroke.txt'
  132. # Build leptonica
  133. leptonica: leptonica.built
  134. leptonica.built: leptonica-$(LEPTONICA_VERSION)
  135. cd $< ; \
  136. ./configure --prefix=$(LOCAL) && \
  137. make -j$(CORES) && \
  138. make install && \
  139. date > "$@"
  140. leptonica-$(LEPTONICA_VERSION): leptonica-$(LEPTONICA_VERSION).tar.gz
  141. tar xf "$<"
  142. leptonica-$(LEPTONICA_VERSION).tar.gz:
  143. wget 'http://www.leptonica.org/source/$@'
  144. # Build tesseract
  145. tesseract: tesseract.built tesseract-langs
  146. tesseract.built: tesseract-$(TESSERACT_VERSION)
  147. cd $< && \
  148. sh autogen.sh && \
  149. PKG_CONFIG_PATH="$(LOCAL)/lib/pkgconfig" \
  150. LEPTONICA_CFLAGS="-I$(LOCAL)/include/leptonica" \
  151. ./configure --prefix=$(LOCAL) && \
  152. LDFLAGS="-L$(LOCAL)/lib"\
  153. make -j$(CORES) && \
  154. make install && \
  155. make -j$(CORES) training-install && \
  156. date > "$@"
  157. tesseract-$(TESSERACT_VERSION):
  158. wget https://github.com/tesseract-ocr/tesseract/archive/$(TESSERACT_VERSION).zip
  159. unzip $(TESSERACT_VERSION).zip
  160. # Download tesseract-langs
  161. tesseract-langs: $(TESSDATA)/eng.traineddata
  162. $(TESSDATA)/eng.traineddata:
  163. cd $(TESSDATA) && wget https://github.com/tesseract-ocr/tessdata$(TESSDATA_REPO)/raw/master/$(notdir $@)
  164. # Clean all generated files
  165. clean:
  166. find $(GROUND_TRUTH_DIR) -name '*.box' -delete
  167. find $(GROUND_TRUTH_DIR) -name '*.lstmf' -delete
  168. rm -rf data/all-*
  169. rm -rf data/list.*
  170. rm -rf data/$(MODEL_NAME)
  171. rm -rf data/unicharset
  172. rm -rf data/checkpoints