diff --git a/app-text/tesseract_data/tesseract_data-3.04.00.recipe b/app-text/tesseract_data/tesseract_data-3.04.00.recipe
new file mode 100644
index 000000000..96a8f9725
--- /dev/null
+++ b/app-text/tesseract_data/tesseract_data-3.04.00.recipe
@@ -0,0 +1,157 @@
+SUMMARY="Language data files for Tesseract OCR engine"
+DESCRIPTION="Tesseract OCR can be fully trained to recognize new languages and scripts. \
+Community-made trained data files for many languages are available as \
+separate packages, one per language."
+HOMEPAGE="https://github.com/tesseract-ocr/"
+LICENSE="Apache v2"
+COPYRIGHT="1985-1995 HP labs
+	2012 Google Inc."
+REVISION="1"
+SOURCE_URI="https://github.com/tesseract-ocr/tessdata/archive/$portVersion.tar.gz"
+CHECKSUM_SHA256="5dcb37198336b6953843b461ee535df1401b41008d550fc9e43d0edabca7adb1"
+SOURCE_DIR="tessdata-$portVersion"
+DISABLE_SOURCE_PACKAGE=yes
+
+ARCHITECTURES="any"
+
+PROVIDES="
+	$portName = $portVersion
+	"
+BUILD_REQUIRES="
+	"
+
+declare -A languages  # data-file language code => human-readable name
+# Special data files
+languages[osd]="orientation and script detection"
+languages[equ]="math / equation detection"
+
+# Language data files
+languages[afr]="Afrikaans"
+languages[amh]="Amharic"
+languages[ara]="Arabic"
+languages[asm]="Assamese"
+languages[aze]="Azerbaijani"
+languages[aze_cyrl]="Azerbaijani - Cyrillic"
+languages[bel]="Belarusian"
+languages[ben]="Bengali"
+languages[bod]="Tibetan"
+languages[bos]="Bosnian"
+languages[bul]="Bulgarian"
+languages[cat]="Catalan; Valencian"
+languages[ceb]="Cebuano"
+languages[ces]="Czech"
+languages[chi_sim]="Chinese - Simplified"
+languages[chi_tra]="Chinese - Traditional"
+languages[chr]="Cherokee"
+languages[cym]="Welsh"
+languages[dan]="Danish"
+languages[dan_frak]="Danish - Fraktur script"
+languages[deu]="German"
+languages[deu_frak]="German - Fraktur script"
+languages[dzo]="Dzongkha"
+languages[ell]="Greek, Modern (1453-)"
+languages[eng]="English"
+languages[enm]="English, Middle (1100-1500)"
+languages[epo]="Esperanto"
+languages[est]="Estonian"
+languages[eus]="Basque"
+languages[fas]="Persian"
+languages[fin]="Finnish"
+languages[fra]="French"
+languages[frk]="Frankish"
+languages[frm]="French, Middle (ca. 1400-1600)"
+languages[gle]="Irish"
+languages[glg]="Galician"
+languages[grc]="Greek, Ancient (-1453)"
+languages[guj]="Gujarati"
+languages[hat]="Haitian; Haitian Creole"
+languages[heb]="Hebrew"
+languages[hin]="Hindi"
+languages[hrv]="Croatian"
+languages[hun]="Hungarian"
+languages[iku]="Inuktitut"
+languages[ind]="Indonesian"
+languages[isl]="Icelandic"
+languages[ita]="Italian"
+languages[ita_old]="Italian - Old"
+languages[jav]="Javanese"
+languages[jpn]="Japanese"
+languages[kan]="Kannada"
+languages[kat]="Georgian"
+languages[kat_old]="Georgian - Old"
+languages[kaz]="Kazakh"
+languages[khm]="Central Khmer"
+languages[kir]="Kirghiz; Kyrgyz"
+languages[kor]="Korean"
+languages[kur]="Kurdish"
+languages[lao]="Lao"
+languages[lat]="Latin"
+languages[lav]="Latvian"
+languages[lit]="Lithuanian"
+languages[mal]="Malayalam"
+languages[mar]="Marathi"
+languages[mkd]="Macedonian"
+languages[mlt]="Maltese"
+languages[msa]="Malay"
+languages[mya]="Burmese"
+languages[nep]="Nepali"
+languages[nld]="Dutch; Flemish"
+languages[nor]="Norwegian"
+languages[ori]="Oriya"
+languages[pan]="Panjabi; Punjabi"
+languages[pol]="Polish"
+languages[por]="Portuguese"
+languages[pus]="Pushto; Pashto"
+languages[ron]="Romanian; Moldavian; Moldovan"
+languages[rus]="Russian"
+languages[san]="Sanskrit"
+languages[sin]="Sinhala; Sinhalese"
+languages[slk]="Slovak"
+languages[slk_frak]="Slovak - Fraktur script"
+languages[slv]="Slovenian"
+languages[spa]="Spanish; Castilian"
+languages[spa_old]="Spanish; Castilian - Old"
+languages[sqi]="Albanian"
+languages[srp]="Serbian"
+languages[srp_latn]="Serbian - Latin"
+languages[swa]="Swahili"
+languages[swe]="Swedish"
+languages[syr]="Syriac"
+languages[tam]="Tamil"
+languages[tel]="Telugu"
+languages[tgk]="Tajik"
+languages[tgl]="Tagalog"
+languages[tha]="Thai"
+languages[tir]="Tigrinya"
+languages[tur]="Turkish"
+languages[uig]="Uighur; Uyghur"
+languages[ukr]="Ukrainian"
+languages[urd]="Urdu"
+languages[uzb]="Uzbek"
+languages[uzb_cyrl]="Uzbek - Cyrillic"
+languages[vie]="Vietnamese"
+languages[yid]="Yiddish"
+
+for lang in "${!languages[@]}"; do  # define per-language SUMMARY_/PROVIDES_/REQUIRES_ variables
+	desc=${languages[${lang}]}  # human-readable name for this language code
+
+	eval "\
+		SUMMARY_${lang}=\"Data files for ${desc}\";\
+		PROVIDES_${lang}=\"\
+			${portName}_${lang} = $portVersion\
+			\"; \
+		REQUIRES_${lang}=\"\
+			haiku\n\
+			tesseract >= 3\n\
+			\""
+done  # NOTE(review): REQUIRES_* holds literal backslash-n pairs, presumably split by the recipe parser - confirm
+
+INSTALL()
+{
+	mkdir -p "$dataDir/tessdata"
+	for lang in "${!languages[@]}"; do  # copy and register the files of each language code
+		cp "$lang".* "$dataDir/tessdata"
+		packageEntries "$lang" \
+			"$dataDir/tessdata/$lang".*
+	done
+}