
Commit 0c56a5c

Added summary-level rougeL scorer
1 parent a1e5096 commit 0c56a5c

5 files changed: +151 -41 lines changed

compare_mt/formatting.py

Lines changed: 3 additions & 3 deletions

@@ -22,15 +22,15 @@ def escape_latex(self, x):
     x = pat.sub(replace_with, x)
     return x
 
-  def __call__(self, x):
+  def __call__(self, x, latex=True):
     """Convert object to string with controlled decimals"""
     if isinstance(x, str):
-      return self.escape_latex(x)
+      return self.escape_latex(x) if latex else x
     elif isinstance(x, int):
       return f"{x:d}"
     elif isinstance(x, float):
       return f"{x:.{self.decimals}f}"
     else:
       str(x)
 
-fmt = Formatter(decimals=4)
+fmt = Formatter(decimals=4)

compare_mt/reporters.py

Lines changed: 1 addition & 1 deletion

@@ -132,7 +132,7 @@ def print_header(self, header):
 
   def print_tabbed_table(self, tab):
     for x in tab:
-      print('\t'.join([fmt(y) if y else '' for y in x]))
+      print('\t'.join([fmt(y, latex=False) if y else '' for y in x]))
     print()
 
   def generate_report(self, output_fig_file=None, output_fig_format=None, output_directory=None):
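
Together with the Formatter change above, this keeps the one shared fmt instance serving both output paths: LaTeX escaping stays on by default, and print_tabbed_table now opts out for plain tab-separated text. A minimal sketch of the intended behavior (which characters escape_latex rewrites is an assumption here, illustrated with '&'):

from compare_mt.formatting import fmt

print(fmt(0.123456))                    # '0.1235' -- numeric formatting ignores the flag
print(fmt('tom & jerry'))               # assumed: 'tom \& jerry', escaped for LaTeX tables
print(fmt('tom & jerry', latex=False))  # 'tom & jerry', raw text for tabbed output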

compare_mt/rouge/rouge_scorer.py

Lines changed: 123 additions & 30 deletions

@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The Google Research Authors.
+# Copyright 2019 The Google Research Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,20 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Lint as: python2, python3
 """Computes rouge scores between two text blobs.
-
 Implementation replicates the functionality in the original ROUGE package. See:
-
 Lin, Chin-Yew. ROUGE: a Package for Automatic Evaluation of Summaries. In
 Proceedings of the Workshop on Text Summarization Branches Out (WAS 2004),
 Barcelona, Spain, July 25 - 26, 2004.
-
 Default options are equivalent to running:
 ROUGE-1.5.5.pl -e data -n 2 -a settings.xml
-
 Or with use_stemmer=True:
 ROUGE-1.5.5.pl -m -e data -n 2 -a settings.xml
-
 In these examples settings.xml lists input files and formats.
 """
 
@@ -38,16 +34,15 @@
 import re
 
 from nltk.stem import porter
-import numpy as np
 import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import map
+from six.moves import range
 from compare_mt.rouge import scoring
-from compare_mt.rouge import tokenizer
+from compare_mt.rouge import tokenize
 
 
 class RougeScorer(scoring.BaseScorer):
   """Calculate rouges scores between two blobs of text.
-
   Sample usage:
     scorer = RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
     scores = scorer.score('The quick brown fox jumps over the lazy dog',
@@ -56,11 +51,9 @@ class RougeScorer(scoring.BaseScorer):
 
   def __init__(self, rouge_types, use_stemmer=False):
     """Initializes a new RougeScorer.
-
     Valid rouge types that can be computed are:
       rougen (e.g. rouge1, rouge2): n-gram based scoring.
       rougeL: Longest common subsequence based scoring.
-
     Args:
       rouge_types: A list of rouge types to calculate.
       use_stemmer: Bool indicating whether Porter stemmer should be used to
@@ -74,7 +67,6 @@ def __init__(self, rouge_types, use_stemmer=False):
 
   def score(self, target, prediction):
     """Calculates rouge scores between the target and prediction.
-
     Args:
       target: Text containing the target (ground truth) text.
       prediction: Text containing the predicted text.
@@ -84,15 +76,29 @@ def score(self, target, prediction):
       ValueError: If an invalid rouge type is encountered.
     """
 
-    target_tokens = tokenizer.tokenize(target, self._stemmer)
-    prediction_tokens = tokenizer.tokenize(prediction, self._stemmer)
+    target_tokens = tokenize.tokenize(target, self._stemmer)
+    prediction_tokens = tokenize.tokenize(prediction, self._stemmer)
     result = {}
 
     for rouge_type in self.rouge_types:
       if rouge_type == "rougeL":
         # Rouge from longest common subsequences.
         scores = _score_lcs(target_tokens, prediction_tokens)
-      elif re.match(r"rouge[0-9]$", rouge_type):
+      elif rouge_type == "rougeLsum":
+        # Note: Does not support multi-line text.
+        def get_sents(text):
+          # Assume sentences are separated by newline.
+          sents = six.ensure_str(text).split("\n")
+          sents = [x for x in sents if len(x)]
+          return sents
+
+        target_tokens_list = [
+            tokenize.tokenize(s, self._stemmer) for s in get_sents(target)]
+        prediction_tokens_list = [
+            tokenize.tokenize(s, self._stemmer) for s in get_sents(prediction)]
+        scores = _summary_level_lcs(target_tokens_list,
+                                    prediction_tokens_list)
+      elif re.match(r"rouge[0-9]$", six.ensure_str(rouge_type)):
         # Rouge from n-grams.
         n = int(rouge_type[5:])
         if n <= 0:
@@ -109,7 +115,6 @@ def score(self, target, prediction):
 
 def _create_ngrams(tokens, n):
   """Creates ngrams from the given list of tokens.
-
   Args:
     tokens: A list of tokens from which ngrams are created.
     n: Number of tokens to use, e.g. 2 for bigrams.
@@ -118,14 +123,13 @@ def _create_ngrams(tokens, n):
   """
 
   ngrams = collections.Counter()
-  for ngram in (tuple(tokens[i:i + n]) for i in xrange(len(tokens) - n + 1)):
+  for ngram in (tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)):
     ngrams[ngram] += 1
   return ngrams
 
 
 def _score_lcs(target_tokens, prediction_tokens):
   """Computes LCS (Longest Common Subsequence) rouge scores.
-
   Args:
     target_tokens: Tokens from the target text.
     prediction_tokens: Tokens from the predicted text.
@@ -137,16 +141,8 @@ def _score_lcs(target_tokens, prediction_tokens):
     return scoring.Score(precision=0, recall=0, fmeasure=0)
 
   # Compute length of LCS from the bottom up in a table (DP appproach).
-  cols = len(prediction_tokens) + 1
-  rows = len(target_tokens) + 1
-  lcs_table = np.zeros((rows, cols))
-  for i in xrange(1, rows):
-    for j in xrange(1, cols):
-      if target_tokens[i - 1] == prediction_tokens[j - 1]:
-        lcs_table[i, j] = lcs_table[i - 1, j - 1] + 1
-      else:
-        lcs_table[i, j] = max(lcs_table[i - 1, j], lcs_table[i, j - 1])
-  lcs_length = lcs_table[-1, -1]
+  lcs_table = _lcs_table(target_tokens, prediction_tokens)
+  lcs_length = lcs_table[-1][-1]
 
   precision = lcs_length / len(prediction_tokens)
   recall = lcs_length / len(target_tokens)
@@ -155,9 +151,106 @@ def _score_lcs(target_tokens, prediction_tokens):
   return scoring.Score(precision=precision, recall=recall, fmeasure=fmeasure)
 
 
+def _lcs_table(ref, can):
+  """Create 2-d LCS score table."""
+  rows = len(ref)
+  cols = len(can)
+  lcs_table = [[0] * (cols + 1) for _ in range(rows + 1)]
+  for i in range(1, rows + 1):
+    for j in range(1, cols + 1):
+      if ref[i - 1] == can[j - 1]:
+        lcs_table[i][j] = lcs_table[i - 1][j - 1] + 1
+      else:
+        lcs_table[i][j] = max(lcs_table[i - 1][j], lcs_table[i][j - 1])
+  return lcs_table
+
+
+def _backtrack_norec(t, ref, can):
+  """Read out LCS."""
+  i = len(ref)
+  j = len(can)
+  lcs = []
+  while i > 0 and j > 0:
+    if ref[i - 1] == can[j - 1]:
+      lcs.insert(0, i-1)
+      i -= 1
+      j -= 1
+    elif t[i][j - 1] > t[i - 1][j]:
+      j -= 1
+    else:
+      i -= 1
+  return lcs
+
+
+def _summary_level_lcs(ref_sent, can_sent):
+  """ROUGE: Summary-level LCS, section 3.2 in ROUGE paper.
+  Args:
+    ref_sent: list of tokenized reference sentences
+    can_sent: list of tokenized candidate sentences
+  Returns:
+    summary level ROUGE score
+  """
+  if not ref_sent or not can_sent:
+    return scoring.Score(precision=0, recall=0, fmeasure=0)
+
+  m = sum(map(len, ref_sent))
+  n = sum(map(len, can_sent))
+  if not n or not m:
+    return scoring.Score(precision=0, recall=0, fmeasure=0)
+
+  # get token counts to prevent double counting
+  token_cnts_r = collections.Counter()
+  token_cnts_c = collections.Counter()
+  for s in ref_sent:
+    # s is a list of tokens
+    token_cnts_r.update(s)
+  for s in can_sent:
+    token_cnts_c.update(s)
+
+  hits = 0
+  for r in ref_sent:
+    lcs = _union_lcs(r, can_sent)
+    # Prevent double-counting:
+    # The paper describes just computing hits += len(_union_lcs()),
+    # but the implementation prevents double counting. We also
+    # implement this as in version 1.5.5.
+    for t in lcs:
+      if token_cnts_c[t] > 0 and token_cnts_r[t] > 0:
+        hits += 1
+        token_cnts_c[t] -= 1
+        token_cnts_r[t] -= 1
+
+  recall = hits / m
+  precision = hits / n
+  fmeasure = scoring.fmeasure(precision, recall)
+  return scoring.Score(precision=precision, recall=recall, fmeasure=fmeasure)
+
+
+def _union_lcs(ref, c_list):
+  """Find union LCS between a ref sentence and list of candidate sentences.
+  Args:
+    ref: list of tokens
+    c_list: list of list of indices for LCS into reference summary
+  Returns:
+    List of tokens in ref representing union LCS.
+  """
+  lcs_list = [lcs_ind(ref, c) for c in c_list]
+  return [ref[i] for i in _find_union(lcs_list)]
+
+
+def _find_union(lcs_list):
+  """Finds union LCS given a list of LCS."""
+  return sorted(list(set().union(*lcs_list)))
+
+
+def lcs_ind(ref, can):
+  """Returns one of the longest lcs."""
+  t = _lcs_table(ref, can)
+  return _backtrack_norec(t, ref, can)
+
+
 def _score_ngrams(target_ngrams, prediction_ngrams):
   """Compute n-gram based rouge scores.
-
   Args:
     target_ngrams: A Counter object mapping each ngram to number of
       occurrences for the target text.
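
Taken together, the new helpers implement the summary-level measure from section 3.2 of Lin (2004): lcs_ind backtracks through the _lcs_table DP table to recover LCS token indices for one reference/candidate sentence pair, _union_lcs and _find_union merge those index sets across all candidate sentences, and _summary_level_lcs turns the clipped union-LCS hits into precision, recall, and F-measure. A usage sketch of the public entry point (assuming score returns a dict keyed by rouge type, as the result accumulator suggests; the printed values are not verified here):

from compare_mt.rouge.rouge_scorer import RougeScorer

# rougeLsum expects one sentence per newline-separated line in each blob.
scorer = RougeScorer(['rougeL', 'rougeLsum'], use_stemmer=True)
scores = scorer.score('the cat was found under the bed\nthe cat was not found',
                      'the cat was under the bed\nthe tiny little cat was found')
print(scores['rougeL'])     # LCS over the whole blob as one token sequence
print(scores['rougeLsum'])  # union LCS per reference sentence, then aggregated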
compare_mt/rouge/tokenize.py

Lines changed: 5 additions & 6 deletions

@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The Google Research Authors.
+# Copyright 2019 The Google Research Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,40 +13,39 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Lint as: python2, python3
 """A library for tokenizing text."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import re
+import six
 
 
 def tokenize(text, stemmer):
   """Tokenize input text into a list of tokens.
-
   This approach aims to replicate the approach taken by Chin-Yew Lin in
   the original ROUGE implementation.
-
   Args:
     text: A text blob to tokenize.
    stemmer: An optional stemmer.
-
   Returns:
     A list of string tokens extracted from input text.
   """

   # Convert everything to lowercase.
   text = text.lower()
   # Replace any non-alpha-numeric characters with spaces.
-  text = re.sub(r"[^a-z0-9]+", " ", text)
+  text = re.sub(r"[^a-z0-9]+", " ", six.ensure_str(text))
 
   tokens = re.split(r"\s+", text)
   if stemmer:
     # Only stem words more than 3 characters long.
     tokens = [stemmer.stem(x) if len(x) > 3 else x for x in tokens]
 
   # One final check to drop any empty or invalid tokens.
-  tokens = [x for x in tokens if re.match(r"^[a-z0-9]+$", x)]
+  tokens = [x for x in tokens if re.match(r"^[a-z0-9]+$", six.ensure_str(x))]
 
   return tokens
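
As a quick illustration of these rules (lowercase everything, split on non-alphanumeric characters, stem only tokens longer than 3 characters), a sketch of expected output; the exact stems come from NLTK's Porter implementation:

from nltk.stem import porter
from compare_mt.rouge import tokenize

print(tokenize.tokenize('The foxes JUMPED over... the dog!', None))
# ['the', 'foxes', 'jumped', 'over', 'the', 'dog']
print(tokenize.tokenize('The foxes JUMPED over... the dog!', porter.PorterStemmer()))
# ['the', 'fox', 'jump', 'over', 'the', 'dog'] -- 'the' and 'dog' are too short to stem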

compare_mt/scorers.py

Lines changed: 19 additions & 1 deletion

@@ -544,8 +544,14 @@ def score_sentence(self, ref, out):
       out = [self._stemmer.stem(x) if len(x) > 3 else x for x in out]
 
     if self.rouge_type == 'rougeL':
+      ref, out = self.tokenize(" ".join(ref)), self.tokenize(" ".join(out))
       scores = rouge_scorer._score_lcs(ref, out)
+    elif self.rouge_type == 'rougeLsum':
+      refs = [self.tokenize(s) for s in self.get_sents(ref)]
+      outs = [self.tokenize(s) for s in self.get_sents(out)]
+      scores = rouge_scorer._summary_level_lcs(refs, outs)
     elif re.match(r"rouge[0-9]$", self.rouge_type):
+      ref, out = self.tokenize(" ".join(ref)), self.tokenize(" ".join(out))
       n = int(self.rouge_type[5:])
       if n <= 0:
         raise ValueError(f"rougen requires positive n: {self.rouge_type}")
@@ -567,6 +573,18 @@ def score_sentence(self, ref, out):
 
     return self.scale * score_value, None
 
+  def get_sents(self, tokens):
+    # assume sentences are separated by "."
+    sents = " ".join(tokens).split(".")
+    sents = [x for x in sents if len(x)]
+    return sents
+
+  def tokenize(self, tokens):
+    text = re.sub(r"[^a-zA-Z0-9]+", " ", tokens)
+    tokens = re.split(r"\s+", text)
+    tokens = [x for x in tokens if len(x)]
+    return tokens
+
   def name(self):
     return self.rouge_type
 
@@ -859,7 +877,7 @@ def create_scorer_from_profile(profile, case_insensitive=False, meteor_directory
     return RibesScorer(case_insensitive=case_insensitive)
   elif profile == 'chrf':
     return ChrFScorer(case_insensitive=case_insensitive)
-  elif re.match(r"rouge[0-9L]$", profile):
+  elif re.match(r"rouge[0-9L](sum)?$", profile):
     return RougeScorer(rouge_type=profile, case_insensitive=case_insensitive)
   elif profile == 'wer':
     return WERScorer(case_insensitive=case_insensitive)
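
With the widened profile regex, rougeLsum is selectable like any other metric. A usage sketch, assuming score_sentence takes pre-tokenized reference and output token lists (as the " ".join(ref) calls above imply); note that this scorer's get_sents splits sentences on '.', unlike the newline convention in rouge_scorer.py:

from compare_mt import scorers

scorer = scorers.create_scorer_from_profile('rougeLsum')
ref = 'the cat was found under the bed . the cat was not found .'.split()
out = 'the cat was under the bed . the tiny little cat was found .'.split()
score, _ = scorer.score_sentence(ref, out)
print(scorer.name(), score)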
