Add real generation of waveforms for Opus files

3 months ago · 8bc906e765
1 changed files with 68 additions and 4 deletions
--- a/discord/file.py
+++ b/discord/file.py
@ -28,6 +28,9 @@ from typing import Any, Dict, Optional, Tuple, Union
 import os
 import io
 import base64
+from .oggparse import OggStream
+from .opus import Decoder
+import struct

 from .utils import MISSING

@ -85,8 +88,8 @@ class File:

            Voice files must be an audio only format.

-            A *non-exhaustive* list of supported formats are: `mp3`, `ogg`, `wav`, `aac`, and `flac`.
-
+            A *non-exhaustive* list of supported formats are: `ogg`, `mp3`, `wav`, `aac`, and `flac`.
+            
        .. versionadded:: 2.6

    duration: Optional[:class:`float`]
@ -171,9 +174,18 @@ class File:
    def waveform(self) -> str:
        """:class:`str`: The waveform data for the voice message.

+        .. note::
+            If a waveform was not given, it will be generated
+
+            Only supports generating the waveform for Opus format files, other files will be given a random waveform
+        
        .. versionadded:: 2.6"""
        if self._waveform is None:
-            return base64.b64encode(os.urandom(256)).decode('utf-8')
+            try:
+                self._waveform = self.generate_waveform()
+            except Exception:
+                self._waveform = base64.b64encode(os.urandom(256)).decode('utf-8')
+            self.reset()
        return self._waveform

    @filename.setter
@ -206,8 +218,60 @@ class File:
        if self.description is not None:
            payload['description'] = self.description

-        if self.duration is not None:
+        if self.voice:
            payload['duration_secs'] = self.duration
            payload['waveform'] = self.waveform

        return payload
+    
+    def generate_waveform(self) -> str:
+        self.reset()
+        ogg = OggStream(self.fp) # type: ignore
+        decoder = Decoder()
+        waveform: list[int] = []
+        prefixes = [b'OpusHead', b'OpusTags']
+        for packet in ogg.iter_packets():
+            if packet[:8] in prefixes:
+                continue
+
+            if b'vorbis' in packet:
+                raise TypeError("File format is 'vorbis'. Format of 'opus' is required for waveform generation")
+
+            # these are PCM bytes in 16-bit signed little-endian form
+            decoded = decoder.decode(packet, fec=False) 
+
+            # 16 bits -> 2 bytes per sample
+            num_samples = len(decoded) // 2
+            
+            # https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment
+            format = '<' + 'h' * num_samples
+            samples: tuple[int] = struct.unpack(format, decoded)
+
+            waveform.extend(samples)
+        
+        # Make sure all values are positive
+        for i in range(len(waveform)):
+            if waveform[i] < 0:
+                waveform[i] = -waveform[i]
+        
+        # TODO: Figure out how discord sets the sample count
+        # Voice message I've been using has 40 samples, so using that for now
+        points_per_sample = len(waveform) // 40
+        sample_waveform: list[int] = []
+
+        total, count = 0, 0
+        # Average out the amplitudes for each point within a sample
+        for i in range(len(waveform)):
+            total += waveform[i]
+            count += 1
+            if i % points_per_sample == 0:
+                sample_waveform.append(total // count)
+                total, count = 0, 0
+
+        # Maximum value of a waveform is 0xff (255)
+        highest = max(sample_waveform)
+        mult = 255 / highest
+        for i in range(len(sample_waveform)):
+            sample_waveform[i] = int(sample_waveform[i] * mult)
+
+        return base64.b64encode(bytes(sample_waveform)).decode('utf-8')