Source code for lingvo.tasks.lm.tools.download_lm1b

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Downloads and processes lm1b dataset (http://www.statmt.org/lm-benchmark)."""

import glob
import os
import lingvo.compat as tf

tf.flags.DEFINE_string("outdir", "/tmp/lm1b", "The output directory.")
tf.flags.DEFINE_integer(
    "count_cutoff", 3, "Ignore tokens that appear fewer than "
    "this amount of times when creating the vocab file.")

FLAGS = tf.flags.FLAGS


[docs]def main(_):
  basename = "1-billion-word-language-modeling-benchmark-r13output"
  fname = basename + ".tar.gz"
  url = "http://www.statmt.org/lm-benchmark/" + fname
  sha256hash = "01ba60381110baf7f189dfd2b8374de371e8c9a340835793f190bdae9e90a34e"

  tf.keras.utils.get_file(
      fname, url, file_hash=sha256hash, cache_subdir=FLAGS.outdir, extract=True)

  tf.logging.info("Generating vocab file. This may take a few minutes.")
  vocab = {}
  for fname in glob.glob(
      os.path.join(FLAGS.outdir, basename,
                   "training-monolingual.tokenized.shuffled", "news.en*")):
    with open(fname) as f:
      for line in f:
        for w in line.split():
          vocab[w] = vocab.get(w, 0) + 1

  with open(os.path.join(FLAGS.outdir, basename, "vocab.txt"), "w") as f:
    f.write("<epsilon>\t0\n<S>\t1\n</S>\t2\n<UNK>\t3\n")
    id = 4
    for k, v in sorted(list(vocab.items()), key=lambda kv: (-kv[1], kv[0])):
      if v < FLAGS.count_cutoff:
        break
      f.write("%s\t%d\n" % (k, id))
      id += 1


if __name__ == "__main__":
  tf.app.run(main)