@@ -466,6 +466,55 @@ class AudioEncoding(proto.Enum):
466
466
in the audio header; otherwise the request returns an
467
467
[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]
468
468
error code.
469
+
470
+ Values:
471
+ ENCODING_UNSPECIFIED (0):
472
+ Not specified.
473
+ LINEAR16 (1):
474
+ Uncompressed 16-bit signed little-endian
475
+ samples (Linear PCM).
476
+ FLAC (2):
477
+ ``FLAC`` (Free Lossless Audio Codec) is the recommended
478
+ encoding because it is lossless--therefore recognition is
479
+ not compromised--and requires only about half the bandwidth
480
+ of ``LINEAR16``. ``FLAC`` stream encoding supports 16-bit
481
+ and 24-bit samples; however, not all fields in
482
+ ``STREAMINFO`` are supported.
483
+ MULAW (3):
484
+ 8-bit samples that compand 14-bit audio
485
+ samples using G.711 PCMU/mu-law.
486
+ AMR (4):
487
+ Adaptive Multi-Rate Narrowband codec. ``sample_rate_hertz``
488
+ must be 8000.
489
+ AMR_WB (5):
490
+ Adaptive Multi-Rate Wideband codec. ``sample_rate_hertz``
491
+ must be 16000.
492
+ OGG_OPUS (6):
493
+ Opus encoded audio frames in Ogg container
494
+ (`OggOpus <https://wiki.xiph.org/OggOpus>`__).
495
+ ``sample_rate_hertz`` must be one of 8000, 12000, 16000,
496
+ 24000, or 48000.
497
+ SPEEX_WITH_HEADER_BYTE (7):
498
+ Although the use of lossy encodings is not recommended, if a
499
+ very low bitrate encoding is required, ``OGG_OPUS`` is
500
+ highly preferred over Speex encoding. The
501
+ `Speex <https://speex.org/>`__ encoding supported by Cloud
502
+ Speech API has a header byte in each block, as in MIME type
503
+ ``audio/x-speex-with-header-byte``. It is a variant of the
504
+ RTP Speex encoding defined in `RFC
505
+ 5574 <https://tools.ietf.org/html/rfc5574>`__. The stream is
506
+ a sequence of blocks, one block per RTP packet. Each block
507
+ starts with a byte containing the length of the block, in
508
+ bytes, followed by one or more frames of Speex data, padded
509
+ to an integral number of bytes (octets) as specified in RFC
510
+ 5574. In other words, each RTP header is replaced with a
511
+ single byte containing the block length. Only Speex wideband
512
+ is supported. ``sample_rate_hertz`` must be 16000.
513
+ WEBM_OPUS (9):
514
+ Opus encoded audio frames in WebM container
515
+ (`OggOpus <https://wiki.xiph.org/OggOpus>`__).
516
+ ``sample_rate_hertz`` must be one of 8000, 12000, 16000,
517
+ 24000, or 48000.
469
518
"""
470
519
ENCODING_UNSPECIFIED = 0
471
520
LINEAR16 = 1
@@ -646,6 +695,39 @@ class RecognitionMetadata(proto.Message):
646
695
class InteractionType (proto .Enum ):
647
696
r"""Use case categories that the audio recognition request can be
648
697
described by.
698
+
699
+ Values:
700
+ INTERACTION_TYPE_UNSPECIFIED (0):
701
+ Use case is either unknown or is something
702
+ other than one of the other values below.
703
+ DISCUSSION (1):
704
+ Multiple people in a conversation or discussion. For example,
705
+ in a meeting with two or more people actively participating.
706
+ Typically all the primary people speaking would be in the
707
+ same room (if not, see PHONE_CALL).
708
+ PRESENTATION (2):
709
+ One or more persons lecturing or presenting
710
+ to others, mostly uninterrupted.
711
+ PHONE_CALL (3):
712
+ A phone-call or video-conference in which two
713
+ or more people, who are not in the same room,
714
+ are actively participating.
715
+ VOICEMAIL (4):
716
+ A recorded message intended for another
717
+ person to listen to.
718
+ PROFESSIONALLY_PRODUCED (5):
719
+ Professionally produced audio (e.g. TV Show,
720
+ Podcast).
721
+ VOICE_SEARCH (6):
722
+ Transcribe spoken questions and queries into
723
+ text.
724
+ VOICE_COMMAND (7):
725
+ Transcribe voice commands, such as for
726
+ controlling a device.
727
+ DICTATION (8):
728
+ Transcribe speech to text to create a written
729
+ document, such as a text-message, email or
730
+ report.
649
731
"""
650
732
INTERACTION_TYPE_UNSPECIFIED = 0
651
733
DISCUSSION = 1
@@ -660,20 +742,63 @@ class InteractionType(proto.Enum):
660
742
class MicrophoneDistance (proto .Enum ):
661
743
r"""Enumerates the types of capture settings describing an audio
662
744
file.
745
+
746
+ Values:
747
+ MICROPHONE_DISTANCE_UNSPECIFIED (0):
748
+ Audio type is not known.
749
+ NEARFIELD (1):
750
+ The audio was captured from a closely placed
751
+ microphone. E.g. phone, dictaphone, or handheld
752
+ microphone. Generally if the speaker is within
753
+ 1 meter of the microphone.
754
+ MIDFIELD (2):
755
+ The speaker is within 3 meters of the
756
+ microphone.
757
+ FARFIELD (3):
758
+ The speaker is more than 3 meters away from
759
+ the microphone.
663
760
"""
664
761
MICROPHONE_DISTANCE_UNSPECIFIED = 0
665
762
NEARFIELD = 1
666
763
MIDFIELD = 2
667
764
FARFIELD = 3
668
765
669
766
class OriginalMediaType (proto .Enum ):
670
- r"""The original media the speech was recorded on."""
767
+ r"""The original media the speech was recorded on.
768
+
769
+ Values:
770
+ ORIGINAL_MEDIA_TYPE_UNSPECIFIED (0):
771
+ Unknown original media type.
772
+ AUDIO (1):
773
+ The speech data is an audio recording.
774
+ VIDEO (2):
775
+ The speech data was originally recorded on a
776
+ video.
777
+ """
671
778
ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0
672
779
AUDIO = 1
673
780
VIDEO = 2
674
781
675
782
class RecordingDeviceType (proto .Enum ):
676
- r"""The type of device the speech was recorded with."""
783
+ r"""The type of device the speech was recorded with.
784
+
785
+ Values:
786
+ RECORDING_DEVICE_TYPE_UNSPECIFIED (0):
787
+ The recording device is unknown.
788
+ SMARTPHONE (1):
789
+ Speech was recorded on a smartphone.
790
+ PC (2):
791
+ Speech was recorded using a personal computer
792
+ or tablet.
793
+ PHONE_LINE (3):
794
+ Speech was recorded over a phone line.
795
+ VEHICLE (4):
796
+ Speech was recorded in a vehicle.
797
+ OTHER_OUTDOOR_DEVICE (5):
798
+ Speech was recorded outdoors.
799
+ OTHER_INDOOR_DEVICE (6):
800
+ Speech was recorded indoors.
801
+ """
677
802
RECORDING_DEVICE_TYPE_UNSPECIFIED = 0
678
803
SMARTPHONE = 1
679
804
PC = 2
@@ -1034,7 +1159,22 @@ class StreamingRecognizeResponse(proto.Message):
1034
1159
"""
1035
1160
1036
1161
class SpeechEventType (proto .Enum ):
1037
- r"""Indicates the type of speech event."""
1162
+ r"""Indicates the type of speech event.
1163
+
1164
+ Values:
1165
+ SPEECH_EVENT_UNSPECIFIED (0):
1166
+ No speech event specified.
1167
+ END_OF_SINGLE_UTTERANCE (1):
1168
+ This event indicates that the server has detected the end of
1169
+ the user's speech utterance and expects no additional
1170
+ speech. Therefore, the server will not process additional
1171
+ audio (although it may subsequently return additional
1172
+ results). The client should stop sending additional audio
1173
+ data, half-close the gRPC connection, and wait for any
1174
+ additional results until the server closes the gRPC
1175
+ connection. This event is only sent if ``single_utterance``
1176
+ was set to ``true``, and is not used otherwise.
1177
+ """
1038
1178
SPEECH_EVENT_UNSPECIFIED = 0
1039
1179
END_OF_SINGLE_UTTERANCE = 1
1040
1180
0 commit comments