sumeval icon indicating copy to clipboard operation
sumeval copied to clipboard

ROUGE-L for summary-level

Open kan-bayashi opened this issue 3 years ago • 0 comments

Hi. I have a feature request for ROUGE-L calculation with multiple sentences. There are two ways to calculate ROUGE-L: (1) sentence-level and (2) summary-level. See also: https://github.com/google-research/google-research/tree/master/rouge#two-flavors-of-rouge-l

In pythonrouge, if we use a new line for each sentence, we can calculate summary-level ROUGE-L. If we use the concatenated sentence, the value will be sentence-level ROUGE-L.

Google's rouge implementation supports both types of ROUGE-L (rougeL and rougeLsum).

Based on my experiment, sumeval appears to support only sentence-level ROUGE-L. Is that correct? If so, do you plan to implement a summary-level option?

Here is the test code to compare the results:

import numpy as np
import pytest

from pythonrouge.pythonrouge import Pythonrouge
from rouge_score.rouge_scorer import RougeScorer
from sumeval.metrics.rouge import RougeCalculator

# Sentences of the text being scored as the summary, one list element per
# sentence. Tokens (including punctuation) are already separated by spaces.
SUMMARY = [
    (
        "the unusual format has been captured in a series of photographs "
        "by visual journalist anna erickson ."
    ),
    (
        "meet seattle 's rolling neighborhood of rvs , "
        "where each unassuming vehicle is a capsule home ."
    ),
    (
        "meet bud dodson , 57 , and welcome to his home : "
        "an rv in seattle 's sodo where he watches over the parking lot "
        "in exchange for a spot"
    ),
]

# Sentences of the text used as the reference, one list element per sentence.
# Tokens (including punctuation) are already separated by spaces.
ABSTRACT = [
    (
        "around 30 people live a floating life in seattle 's sodo ( south of "
        "downtown ) area in their rvs"
    ),
    (
        "there is one parking lot in particular where the owner lets them "
        "act as watchmen in exchange for a spot to live"
    ),
    (
        "visual journalist anna erickson , who photographed the community , "
        "said they are just grateful to have a home"
    ),
]

@pytest.mark.parametrize("stemming", [True, False])
def test_rouge_mutilple_sentences(stemming):
    """Compare ROUGE F-measures from pythonrouge, sumeval and rouge_score.

    The same multi-sentence summary/reference pair is scored by all three
    libraries and two expectations are checked to 5 decimals:

    * rouge_score's rouge1 / rouge2 / rougeL match sumeval's
      rouge_n(n=1) / rouge_n(n=2) / rouge_l. A mismatch is tolerated when
      ``stemming`` is enabled.
    * pythonrouge's ROUGE-1 / ROUGE-2 / ROUGE-L match rouge_score's
      rouge1 / rouge2 / rougeLsum.

    Args:
        stemming: Forwarded to every library as its stemming switch.
    """
    # --- pythonrouge -------------------------------------------------------
    # pythonrouge receives each document as a list of sentences; every
    # sentence of the list ends up on its own line.
    py_rouge = Pythonrouge(
        summary_file_exist=False,
        summary=[SUMMARY],
        reference=[[ABSTRACT]],
        n_gram=2,
        ROUGE_SU4=False,
        ROUGE_L=True,
        recall_only=False,
        stemming=stemming,
        stopwords=False,
        length_limit=False,
    )
    py_scores = py_rouge.calc_score()
    py_r1 = py_scores["ROUGE-1-F"]
    py_r2 = py_scores["ROUGE-2-F"]
    py_rl = py_scores["ROUGE-L-F"]
    print(f"pythonrouge_rouge_1={py_r1}")
    print(f"pythonrouge_rouge_2={py_r2}")
    print(f"pythonrouge_rouge_l={py_rl}")

    # --- shared newline-joined inputs --------------------------------------
    # rouge_score expects the sentences concatenated with "\n".
    # sumeval does not seem to care how the sentences are joined (?).
    reference_text = "\n".join(ABSTRACT)
    summary_text = "\n".join(SUMMARY)

    # --- sumeval -----------------------------------------------------------
    sumeval_rouge = RougeCalculator(stopwords=False, stemming=stemming, lang="en")
    sumeval_r1 = sumeval_rouge.rouge_n(summary_text, reference_text, n=1)
    sumeval_r2 = sumeval_rouge.rouge_n(summary_text, reference_text, n=2)
    sumeval_rl = sumeval_rouge.rouge_l(summary_text, reference_text)
    print(f"rouge_calculator_rouge_1={sumeval_r1}")
    print(f"rouge_calculator_rouge_2={sumeval_r2}")
    print(f"rouge_calculator_rouge_l={sumeval_rl}")

    # --- rouge_score -------------------------------------------------------
    scorer = RougeScorer(
        ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=stemming
    )
    scorer_result = scorer.score(reference_text, summary_text)
    scorer_r1 = scorer_result["rouge1"].fmeasure
    scorer_r2 = scorer_result["rouge2"].fmeasure
    scorer_rl = scorer_result["rougeL"].fmeasure
    scorer_rlsum = scorer_result["rougeLsum"].fmeasure
    print(f"rouge_scorer_rouge_1={scorer_r1}")
    print(f"rouge_scorer_rouge_2={scorer_r2}")
    print(f"rouge_scorer_rouge_l={scorer_rl}")
    print(f"rouge_scorer_rouge_lsum={scorer_rlsum}")

    # --- rouge_score vs. sumeval -------------------------------------------
    try:
        np.testing.assert_array_almost_equal(
            np.array([scorer_r1, scorer_r2, scorer_rl]),
            np.array([sumeval_r1, sumeval_r2, sumeval_rl]),
            decimal=5,
        )
    except AssertionError as mismatch:
        # With stemming enabled sumeval's values differ, so the mismatch is
        # only fatal when stemming is off.
        # https://github.com/chakki-works/sumeval/issues/20
        if not stemming:
            raise mismatch

    # --- pythonrouge vs. rouge_score ---------------------------------------
    # pythonrouge's ROUGE-L is compared against rougeLsum, not rougeL.
    np.testing.assert_array_almost_equal(
        np.array([py_r1, py_r2, py_rl]),
        np.array([scorer_r1, scorer_r2, scorer_rlsum]),
        decimal=5,
    )

kan-bayashi avatar Sep 29 '20 08:09 kan-bayashi