# Source code for lingvo.tasks.punctuator.tools.download_brown_corpus

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Downloads and processes the Brown Corpus (http://www.nltk.org/nltk_data)."""

import os
import random
import string
from xml.etree import ElementTree
import lingvo.compat as tf

# Module-wide handle on the flag registry; main() reads FLAGS.outdir from it.
FLAGS = tf.flags.FLAGS

# --outdir: directory passed to the downloader as its cache location and used
# as the destination for train.txt / test.txt.
tf.flags.DEFINE_string(
    "outdir",
    "/tmp/punctuator_data",
    "The output directory.",
)


# Default-namespace attribute removed from the raw XML so that plain,
# unqualified tag names can be used in the ElementTree path query below.
_TEI_NAMESPACE_ATTR = b'xmlns="http://www.tei-c.org/ns/1.0"'

# Runs of doubled punctuation and their single-character replacement, applied
# in this order.
_DOUBLED_PUNCTUATION = (
    ("!!", "!"),
    ("??", "?"),
    ("--", "-"),
    ("**", "*"),
    (";;", ";"),
    ("::", ":"),
    (",,", ","),
)


def _detokenize(sentence):
  """Joins the <w>/<c> children of one <s> element into a plain-text string.

  Args:
    sentence: An ElementTree element whose children are word (<w>) and
      punctuation (<c>) tokens.

  Returns:
    The sentence text with spaces placed between tokens.
  """
  # Example input sentence:
  # <s n="1"><w type="AT">The</w> <w subtype="TL" type="NP">Fulton</w>
  # <w subtype="TL" type="NN">County</w> <w subtype="TL" type="JJ">Grand</w>
  # <w subtype="TL" type="NN">Jury</w> <w type="VBD">said</w>
  # <w type="NR">Friday</w> <w type="AT">an</w> <w type="NN">investigation</w>
  # <w type="IN">of</w> <w type="NPg">Atlanta's</w> <w type="JJ">recent</w>
  # <w type="NN">primary</w> <w type="NN">election</w>
  # <w type="VBD">produced</w> <c type="pct">``</c> <w type="AT">no</w>
  # <w type="NN">evidence</w> <c type="pct">''</c> <w type="CS">that</w>
  # <w type="DTI">any</w> <w type="NNS">irregularities</w>
  # <w type="VBD">took</w> <w type="NN">place</w> <c type="pct">.</c> </s>
  # Example output text:
  # The Fulton County Grand Jury said Friday an investigation of Atlanta's
  # recent primary election produced "no evidence" that any irregularities
  # took place.
  pieces = []
  # Whether the next token that wants a leading space should get one.
  prepend_space = False
  for child in sentence:
    token = child.text
    if child.tag == "w":
      if prepend_space:
        pieces.append(" ")
      pieces.append(token)
      prepend_space = True
    elif child.tag == "c":
      if token == "``":
        # Opening quote: space before, none after.
        if prepend_space:
          pieces.append(" ")
        pieces.append('"')
        prepend_space = False
      elif token == "''":
        # Closing quote: attaches to the preceding token.
        pieces.append('"')
        prepend_space = True
      elif token == "'":
        # A lone apostrophe is treated as opening when a space is pending and
        # as closing otherwise.
        if prepend_space:
          pieces.append(" '")
          prepend_space = False
        else:
          pieces.append("'")
          prepend_space = True
      elif token in ("(", "["):
        if prepend_space:
          pieces.append(" ")
        pieces.append(token)
        prepend_space = False
      elif token in ("-", "--"):
        if prepend_space:
          pieces.append(" ")
        pieces.append(token)
        prepend_space = True
      else:
        # Any other punctuation attaches directly to the preceding token.
        pieces.append(token)
        prepend_space = True
  return "".join(pieces)


def _clean_sentence(text):
  """Normalizes one detokenized sentence, or rejects it.

  Args:
    text: The sentence string produced by _detokenize.

  Returns:
    The cleaned sentence with its first character upper-cased, or None when
    the sentence is filtered out (no ASCII letters, an odd number of double
    quotes, unbalanced round/square brackets, or a first character that is not
    an ASCII letter or digit).
  """
  for doubled, single in _DOUBLED_PUNCTUATION:
    text = text.replace(doubled, single)

  if not set(text) & set(string.ascii_letters):
    # No letters. This also rejects the empty string, so the indexing below
    # is safe.
    return None
  if text.count('"') % 2 != 0:
    # Uneven number of quotes.
    return None
  if (text.count("(") != text.count(")") or
      text.count("[") != text.count("]")):
    # Unbalanced parenthesis.
    return None

  # Drop one pair of wrapping quotes/brackets.
  # NOTE(review): only the first and last characters are inspected, so the
  # pair is also dropped when it does not actually match (e.g. `"a" b "c"`
  # becomes `a" b "c`). Kept as is so the generated dataset stays unchanged.
  if (text[0] == '"' and text[-1] == '"' or
      text[0] == "(" and text[-1] == ")" or
      text[0] == "[" and text[-1] == "]"):
    text = text[1:-1]

  if text[0] not in string.ascii_letters and text[0] not in string.digits:
    # Doesn't start with a letter or number.
    return None
  return text[:1].upper() + text[1:]


def main(_):
  """Downloads the Brown Corpus and writes train.txt / test.txt to --outdir.

  The TEI archive is fetched (and checked against a pinned SHA-256) into
  FLAGS.outdir, every <s> element of Corpus.xml is turned into a plain-text
  sentence, rejected sentences are dropped, duplicates are removed, and the
  result is shuffled with a fixed seed and split 95% / 5% into train.txt and
  test.txt, one sentence per line.

  Args:
    _: Unused positional argument supplied by the app runner.
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  basename = "brown_tei"
  fname = basename + ".zip"
  url = ("https://raw.githubusercontent.com/nltk/nltk_data/" +
         "gh-pages/packages/corpora/" + fname)
  sha256 = "335bec1ea6362751d5d5c46970137ebb01c80bf7d7d75558787729d275e0a687"
  tf.keras.utils.get_file(
      fname, url, file_hash=sha256, cache_subdir=FLAGS.outdir, extract=True)

  tf.logging.info("\nDownload completed. Preprocessing...")
  # Read raw bytes so that the XML parser, not the process locale, decides how
  # the document is decoded.
  with open(os.path.join(FLAGS.outdir, basename, "Corpus.xml"), "rb") as xml:
    root = ElementTree.fromstring(xml.read().replace(_TEI_NAMESPACE_ATTR, b""))

  unique_sentences = set()
  for sentence in root.findall("./TEI/text/body/p/s"):
    text = _clean_sentence(_detokenize(sentence))
    if text is not None:
      unique_sentences.add(text)

  # Sort before the seeded shuffle so the final order does not depend on set
  # iteration order.
  sentences = sorted(unique_sentences)
  random.seed(1234)
  random.shuffle(sentences)

  split = int(len(sentences) * 0.95)
  # Explicit encoding keeps the written bytes independent of the locale.
  with open(os.path.join(FLAGS.outdir, "train.txt"), "w",
            encoding="utf-8") as f:
    f.writelines("%s\n" % line for line in sentences[:split])
  with open(os.path.join(FLAGS.outdir, "test.txt"), "w",
            encoding="utf-8") as f:
    f.writelines("%s\n" % line for line in sentences[split:])
  tf.logging.info("All done.")
if __name__ == "__main__":
  # Script entry point: hand main to the TF app runner.
  tf.app.run(main)