# NOT RUN {
# Because extracting bg-en.tgz from the Europarl corpus is time-consuming,
# the unzipped files have been temporarily uploaded to
# http://www.um.ac.ir/~sarmad/... .
# }
# NOT RUN {
# Example calls to prepareData() on the Bulgarian (euro.bg) and English
# (euro.en) sample files. The same pair of files is used by every call, so
# the URLs are defined once here.
sorc_url <- "http://www.um.ac.ir/~sarmad/word.a/euro.bg"
trgt_url <- "http://www.um.ac.ir/~sarmad/word.a/euro.en"

# All optional arguments other than nrec and encode.sorc are left at their
# defaults.
# NOTE(review): nrec = 20 presumably limits how many records are read --
# confirm against the prepareData() documentation.
aa1 <- prepareData(sorc_url, trgt_url, nrec = 20, encode.sorc = "UTF-8")

# Same input, with word_align switched off.
aa2 <- prepareData(
  sorc_url, trgt_url,
  nrec = 20, encode.sorc = "UTF-8", word_align = FALSE
)

# Same input, with removePt switched off.
aa3 <- prepareData(
  sorc_url, trgt_url,
  nrec = 20, encode.sorc = "UTF-8", removePt = FALSE
)
# }
# Run the code above in your browser using DataLab