# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Generates the words_to_ids file from a BPE encoded corpus and BPE vocab file.
Extracts all the words in the corpus with their corresponding list of ids. Each
subword in the vocab file is mapped to its line number as its id. The lines of
the output file look like:
...
TAKE 43,7,50,14
THAT 16,35
THE 26
THEIR 16,4,9,56
...
This format is compatible with the BPE tokenizer op in core/tokenizer.py.
Typical workflow:
subword-nmt learn-bpe train_file code_file
subword-nmt apply-bpe code_file train_file train_bpe_file
subword-nmt get-vocab train_bpe_file vocab_file
bpe_word_tokenizer train_bpe_file vocab_file words_to_ids_file
"""
import lingvo.compat as tf
# Command-line flags. All three are string paths that default to the empty
# string; `main` reads them through the module-level FLAGS object.
tf.flags.DEFINE_string(
    'encoded_filepath',
    default='',
    help='Path to the BPE encoded corpus file.')
tf.flags.DEFINE_string(
    'vocab_filepath',
    default='',
    help='Path to the BPE vocab file.')
tf.flags.DEFINE_string(
    'output_filepath',
    default='',
    help='The output filepath (word_to_ids).')

FLAGS = tf.flags.FLAGS
def _GetVocabulary(vocab_filepath):
  """Maps the first word in each line of the given file to its line number.

  Line numbers are zero-based and count every line of the file, including
  lines that contribute no entry (blank lines or lines made only of spaces).
  If the same word starts several lines, the last occurrence wins.

  Args:
    vocab_filepath: Path to the vocab file. Each line is expected to start
      with the subword, optionally followed by a space and further fields,
      which are ignored.

  Returns:
    A dict mapping each subword (str) to the zero-based index (int) of the
    line it appears on.
  """
  vocab = {}
  # NOTE(review): the file is decoded with the platform default encoding; any
  # other reader of the same corpus must decode the same way for lookups in
  # this dict to match.
  with open(vocab_filepath, 'r') as vocab_file:
    for line_number, line in enumerate(vocab_file):
      # Strip line terminators and surrounding spaces, then keep only the
      # text before the first space.
      word = line.strip('\r\n ').split(' ')[0]
      if word:
        vocab[word] = line_number
  return vocab
def main(_):
  """Writes the word-to-ids file for the corpus and vocab named by the flags.

  Builds the vocab from `FLAGS.vocab_filepath`, obtains the per-word
  tokenization of `FLAGS.encoded_filepath` from `_ExtractTokenization`, and
  writes one line per word to `FLAGS.output_filepath` in the form
  `WORD id1,id2,...`.

  Args:
    _: Unused positional argument supplied by the app runner.
  """
  vocab = _GetVocabulary(FLAGS.vocab_filepath)
  # NOTE(review): `_ExtractTokenization` is defined outside this block; it is
  # used here as a mapping from word to an iterable of ids -- confirm.
  word_tokenization = _ExtractTokenization(FLAGS.encoded_filepath, vocab)
  with open(FLAGS.output_filepath, 'w') as output:
    for word, ids in word_tokenization.items():
      # Word, a single space, then the ids joined by commas.
      output.write(word + ' ')
      output.write(','.join(map(str, ids)))
      # Each record is terminated with an explicit '\r\n'.
      output.write('\r\n')
if __name__ == '__main__':
  # Run `main` through the TF app runner only when executed as a script, so
  # importing this module has no side effect beyond defining the flags.
  tf.app.run(main)