         evaluation_steps=_EVALUATION_STEPS,
     ),
 )
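+# COMET and MetricX translation metric fixtures used by the translation metric tests below.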
+_TEST_COMET = pointwise_metric.Comet(
+    version="COMET_22_SRC_REF",
+    source_language="en",
+    target_language="zh",
+)
+_TEST_METRICX = pointwise_metric.MetricX(
+    version="METRICX_24_SRC",
+    source_language="en",
+    target_language="zh",
+)
 _TEST_METRICS = (
     "exact_match",
     "bleu",
139149 "reference" : ["test" , "ref" ],
140150 "context" : ["test" , "context" ],
141151 "instruction" : ["test" , "instruction" ],
152+ "source" : ["test" , "source" ],
142153 }
143154)
144155_TEST_EVAL_DATASET_SINGLE = pd .DataFrame ({"prompt" : ["test_prompt" , "text_prompt" ]})
@@ -305,7 +316,7 @@
         )
     ),
 )
-_MOCK_POINTEWISE_RESULT = (
+_MOCK_POINTWISE_RESULT = (
     gapic_evaluation_service_types.EvaluateInstancesResponse(
         pointwise_metric_result=gapic_evaluation_service_types.PointwiseMetricResult(
             score=5, explanation="explanation"
@@ -423,6 +434,29 @@
         )
     ),
 )
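+# Expected default metric column mapping, shared across tests below.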
+_EXPECTED_COLUMN_MAPPING = {
+    "context": "context",
+    "reference": "reference",
+    "response": "response",
+    "instruction": "instruction",
+    "prompt": "prompt",
+    "source": "source",
+}
+_MOCK_MODEL_BASED_TRANSLATION_RESULT = (
+    # The order of the responses is important: `side_effect` returns them in
+    # sequence, one per mocked `evaluate_instances` call.
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        comet_result=gapic_evaluation_service_types.CometResult(score=0.1)
+    ),
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        metricx_result=gapic_evaluation_service_types.MetricxResult(score=5)
+    ),
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        comet_result=gapic_evaluation_service_types.CometResult(score=0.9)
+    ),
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        metricx_result=gapic_evaluation_service_types.MetricxResult(score=20)
+    ),
+)
 
 
 @pytest.fixture(scope="module")
@@ -465,16 +499,10 @@ def test_create_eval_task(self):
         assert test_eval_task.dataset.equals(_TEST_EVAL_DATASET_ALL_INCLUDED)
         assert test_eval_task.metrics == _TEST_METRICS
         assert test_eval_task.experiment == _TEST_EXPERIMENT
-        assert test_eval_task._metric_column_mapping == {
-            "context": "context",
-            "reference": "reference",
-            "response": "response",
-            "instruction": "instruction",
-            "prompt": "prompt",
-        }
+        assert test_eval_task._metric_column_mapping == _EXPECTED_COLUMN_MAPPING
 
     @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
-    def test_compute_automatic_metrics(self, api_transport):
+    def test_compute_exact_match_metric(self, api_transport):
         aiplatform.init(
             project=_TEST_PROJECT,
             location=_TEST_LOCATION,
@@ -521,7 +549,7 @@ def test_compute_pointwise_metrics(self, api_transport):
         test_eval_task = EvalTask(
             dataset=_TEST_EVAL_DATASET_ALL_INCLUDED, metrics=test_metrics
         )
-        mock_metric_results = _MOCK_POINTEWISE_RESULT
+        mock_metric_results = _MOCK_POINTWISE_RESULT
         with mock.patch.object(
             target=gapic_evaluation_services.EvaluationServiceClient,
             attribute="evaluate_instances",
@@ -543,6 +571,7 @@ def test_compute_pointwise_metrics(self, api_transport):
543571 "reference" ,
544572 "test_pointwise_metric/score" ,
545573 "test_pointwise_metric/explanation" ,
574+ "source" ,
546575 ]
547576 )
548577 assert test_result .metrics_table ["response" ].equals (
@@ -567,7 +596,7 @@ def test_compute_pointwise_metrics_free_string(self):
             metrics=[_TEST_POINTWISE_METRIC_FREE_STRING],
             metric_column_mapping={"abc": "prompt"},
         )
-        mock_metric_results = _MOCK_POINTEWISE_RESULT
+        mock_metric_results = _MOCK_POINTWISE_RESULT
         with mock.patch.object(
             target=gapic_evaluation_services.EvaluationServiceClient,
             attribute="evaluate_instances",
@@ -589,6 +618,7 @@ def test_compute_pointwise_metrics_free_string(self):
589618 "reference" ,
590619 "test_pointwise_metric_str/score" ,
591620 "test_pointwise_metric_str/explanation" ,
621+ "source" ,
592622 ]
593623 )
594624 assert test_result .metrics_table ["response" ].equals (
@@ -695,6 +725,7 @@ def test_compute_pointwise_metrics_without_model_inference(self, api_transport):
695725 "response" ,
696726 "summarization_quality/score" ,
697727 "summarization_quality/explanation" ,
728+ "source" ,
698729 ]
699730 )
700731 assert list (
@@ -707,6 +738,48 @@ def test_compute_pointwise_metrics_without_model_inference(self, api_transport):
707738 "explanation" ,
708739 ]
709740
741+ @pytest .mark .parametrize ("api_transport" , ["grpc" , "rest" ])
742+ def test_compute_model_based_translation_metrics_without_model_inference (
743+ self , api_transport
744+ ):
745+ aiplatform .init (
746+ project = _TEST_PROJECT ,
747+ location = _TEST_LOCATION ,
748+ api_transport = api_transport ,
749+ )
750+ test_metrics = [_TEST_COMET , _TEST_METRICX ]
751+ test_eval_task = EvalTask (
752+ dataset = _TEST_EVAL_DATASET_ALL_INCLUDED , metrics = test_metrics
753+ )
754+
755+ mock_metric_results = _MOCK_MODEL_BASED_TRANSLATION_RESULT
756+ with mock .patch .object (
757+ target = gapic_evaluation_services .EvaluationServiceClient ,
758+ attribute = "evaluate_instances" ,
759+ side_effect = mock_metric_results ,
760+ ):
761+ test_result = test_eval_task .evaluate ()
762+
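+        # Expected summary stats follow from the mocked scores:
+        # comet [0.1, 0.9] -> mean 0.5; metricx [5, 20] -> mean 12.5.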
+        assert test_result.summary_metrics["row_count"] == 2
+        assert test_result.summary_metrics["comet/mean"] == 0.5
+        assert test_result.summary_metrics["metricx/mean"] == 12.5
+        assert test_result.summary_metrics["comet/std"] == pytest.approx(0.5, 0.6)
+        assert test_result.summary_metrics["metricx/std"] == pytest.approx(10, 11)
+        assert set(test_result.metrics_table.columns.values) == set(
+            [
+                "context",
+                "instruction",
+                "reference",
+                "prompt",
+                "response",
+                "source",
+                "comet/score",
+                "metricx/score",
+            ]
+        )
+        assert list(test_result.metrics_table["comet/score"].values) == [0.1, 0.9]
+        assert list(test_result.metrics_table["metricx/score"].values) == [5, 20]
+
     @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
     def test_compute_automatic_metrics_with_custom_metric_spec(self, api_transport):
         aiplatform.init(
@@ -940,6 +1013,7 @@ def test_compute_pairwise_metrics_without_model_inference(self, api_transport):
9401013 "instruction" ,
9411014 "pairwise_summarization_quality/pairwise_choice" ,
9421015 "pairwise_summarization_quality/explanation" ,
1016+ "source" ,
9431017 ]
9441018 )
9451019 assert list (
@@ -1281,7 +1355,7 @@ def test_evaluate_response_column_and_model_not_provided(self):
         ):
             test_eval_task.evaluate()
 
-    def test_evaluate_baseline_response_column_and_baseline_model_not_provided(
+    def test_evaluate_baseline_model_response_column_not_provided(
         self,
     ):
         test_eval_dataset = _TEST_EVAL_DATASET_SINGLE.copy(deep=True)
@@ -1302,6 +1376,63 @@ def test_evaluate_baseline_response_column_and_baseline_model_not_provided(
         ):
             test_eval_task.evaluate()
 
+    def test_evaluate_response_column_not_provided(
+        self,
+    ):
+        test_eval_dataset = _TEST_EVAL_DATASET_SINGLE
+        test_eval_task = EvalTask(
+            dataset=test_eval_dataset,
+            metrics=["exact_match"],
+        )
+        with pytest.raises(
+            KeyError,
+            match=re.escape(
+                (
+                    "Required column `response` not found in the evaluation "
+                    "dataset. The columns in the evaluation dataset are ['prompt']"
+                )
+            ),
+        ):
+            test_eval_task.evaluate()
+
+    def test_evaluate_reference_column_not_provided(
+        self,
+    ):
+        test_eval_dataset = pd.DataFrame({"response": ["test", "text"]})
+        test_eval_task = EvalTask(
+            dataset=test_eval_dataset,
+            metrics=["exact_match"],
+        )
+        with pytest.raises(
+            KeyError,
+            match=re.escape(
+                (
+                    "Required column `reference` not found in the evaluation "
+                    "dataset. The columns in the evaluation dataset are ['response']"
+                )
+            ),
+        ):
+            test_eval_task.evaluate()
+
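+    # The COMET and MetricX metrics configured above require a `source` column
+    # in the evaluation dataset.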
+    def test_evaluate_reference_or_source_column_not_provided(
+        self,
+    ):
+        test_eval_dataset = pd.DataFrame({"response": ["test", "text"]})
+        test_eval_task = EvalTask(
+            dataset=test_eval_dataset,
+            metrics=[_TEST_COMET, _TEST_METRICX],
+        )
+        with pytest.raises(
+            KeyError,
+            match=re.escape(
+                (
+                    "Required column `source` not found in the evaluation "
+                    "dataset. The columns in the evaluation dataset are ['response']"
+                )
+            ),
+        ):
+            test_eval_task.evaluate()
+
     def test_evaluate_invalid_prompt_template_variables(self):
         test_eval_task = EvalTask(
             dataset=_TEST_EVAL_DATASET_SINGLE,
@@ -1530,13 +1661,7 @@ def test_initialize_metric_column_mapping(self):
             metric_column_mapping=metric_column_mapping,
             dataset=_TEST_EVAL_DATASET_ALL_INCLUDED,
         )
-        assert converted_metric_column_mapping == {
-            "prompt": "prompt",
-            "response": "response",
-            "reference": "reference",
-            "context": "context",
-            "instruction": "instruction",
-        }
+        assert converted_metric_column_mapping == _EXPECTED_COLUMN_MAPPING
 
 
 class TestPromptTemplate: