@@ -389,7 +389,6 @@ def receive_dlq():
389
389
snapshot .match ("messages" , messages )
390
390
391
391
392
- # TODO: flaky against AWS
393
392
@markers .aws .validated
394
393
def test_report_batch_item_failures (
395
394
create_lambda_function ,
@@ -412,19 +411,27 @@ def test_report_batch_item_failures(
412
411
"get_destination_queue_url" , aws_client .sqs .get_queue_url (QueueName = destination_queue_name )
413
412
)
414
413
415
- # timeout in seconds, used for both the lambda and the queue visibility timeout.
416
- # increase to 10 if testing against AWS fails.
417
- retry_timeout = 8
414
+ # If an SQS queue is not receiving a lot of traffic, Lambda can take up to 20s between invocations.
415
+ # See AWS docs https://docs.aws.amazon.com/lambda/latest/dg/with-sqs.html.
416
+ retry_timeout = 6
417
+ visibility_timeout = 8
418
418
retries = 2
419
419
420
+ # AWS recommends a visibility timeout of at least 6x the Lambda function's timeout. To ensure a short test
421
+ # runtime, we just want to ensure messages are re-queued a couple of seconds after any potential timeouts.
422
+ # See https://docs.aws.amazon.com/lambda/latest/dg/services-sqs-configure.html#events-sqs-queueconfig
423
+ assert visibility_timeout > retry_timeout , (
424
+ "A lambda needs to finish processing prior to re-queuing invisible messages"
425
+ )
426
+
420
427
# set up lambda function
421
428
function_name = f"failing-lambda-{ short_uid ()} "
422
429
create_lambda_function (
423
430
func_name = function_name ,
424
431
handler_file = LAMBDA_SQS_BATCH_ITEM_FAILURE_FILE ,
425
432
runtime = Runtime .python3_12 ,
426
433
role = lambda_su_role ,
427
- timeout = retry_timeout , # timeout needs to be <= than visibility timeout
434
+ timeout = retry_timeout ,
428
435
envvars = {"DESTINATION_QUEUE_URL" : destination_url },
429
436
)
430
437
@@ -441,7 +448,7 @@ def test_report_batch_item_failures(
441
448
Attributes = {
442
449
"FifoQueue" : "true" ,
443
450
# the visibility timeout is implicitly also the time between retries
444
- "VisibilityTimeout" : str (retry_timeout ),
451
+ "VisibilityTimeout" : str (visibility_timeout ),
445
452
"RedrivePolicy" : json .dumps (
446
453
{"deadLetterTargetArn" : event_dlq_arn , "maxReceiveCount" : retries }
447
454
),
@@ -521,8 +528,14 @@ def test_report_batch_item_failures(
521
528
assert "Messages" not in dlq_messages or dlq_messages ["Messages" ] == []
522
529
523
530
# now wait for the second invocation result which is expected to have processed message 2 and 3
531
+ # Since we are re-queuing twice, with a visibility timeout of 8s, this should instead be waiting for 20s => 8s x 2 retries (+ 4s margin).
532
+ # See AWS docs: https://docs.aws.amazon.com/AWSSimpleQueueService/latest/APIReference/API_ReceiveMessage.html#API_ReceiveMessage_RequestSyntax
533
+ second_timeout_with_margin = (visibility_timeout * 2 ) + 4
534
+ assert second_timeout_with_margin <= 20 , (
535
+ "An SQS ReceiveMessage operation cannot wait for more than 20s"
536
+ )
524
537
second_invocation = aws_client .sqs .receive_message (
525
- QueueUrl = destination_url , WaitTimeSeconds = retry_timeout + 2 , MaxNumberOfMessages = 1
538
+ QueueUrl = destination_url , WaitTimeSeconds = second_timeout_with_margin , MaxNumberOfMessages = 1
526
539
)
527
540
assert "Messages" in second_invocation
528
541
# hack to make snapshot work
0 commit comments