ffmpeg simple player (4)--use SDL to play audio

SDL (English: Simple DirectMedia Layer) is a set of open source cross-platform multimedia development function libraries written in C language. SDL provides several functions to control images, sounds, input and output, allowing developers to develop application software across multiple platforms (Linux, Windows, Mac OS X, etc.) by using the same or similar code. Currently, SDL is mostly used in the development of multimedia applications such as games, simulators, and media players (from Wikipedia).

You can see that SDL can do many functions, including playing audio and designing GUI interfaces, but here we only use SDL to play audio.

How SDL plays audio

SDL can play audio in two modes: push mode and pull mode. In push mode, the application actively writes audio data into the device playback buffer. In pull mode, SDL asks the application for data through a callback whenever the device playback buffer needs more. Here we use pull mode, which is the more commonly used of the two.

Our operation process this time is

Initialize ffmpeg and SDL2 related components

Write a callback function for SDL2

Write decoding function

The way SDL2 uses the pull mode to play audio is that when the audio data in the device playback buffer is insufficient, SDL2 will call the callback function we provide, and we fill the audio data into the device playback buffer in the callback function. This enables audio playback.

SDL installation

As with ffmpeg in the previous article, I build and install SDL from source and write my own CMake find module (for example `FindSDL2.cmake`) to reference the SDL library. Download the source code from the SDL website; for the build and installation steps, please search online. An article on the process is provided here for reference.

Here is my find-module file (for example `FindSDL2.cmake`):

# Minimal find module for a self-built SDL2 installation.

# Root folder of the installed SDL2 (edit this to match your machine).
set(SDL2ROOT /path/to/your/sdl2)

# Header and library directories beneath the installation root.
set(SDL2_INCLUDE_DIRS ${SDL2ROOT}/include)
set(SDL2_LIBRARY_DIRS ${SDL2ROOT}/lib)

# Look for the SDL2 library, using the library directory as a search path,
# and store its full path in SDL2_LIBS.
find_library(SDL2_LIBS
    NAMES SDL2
    PATHS ${SDL2_LIBRARY_DIRS})

Then add the following to `CMakeLists.txt`:

# Locate SDL2; REQUIRED makes configuration fail if the package is not found.
find_package(SDL2 REQUIRED)
# Add the SDL2 header directory to the include search path.
include_directories(${SDL2_INCLUDE_DIRS})
# (remaining project configuration omitted in this excerpt)
......
# Link the project target against the SDL2 library.
target_link_libraries(${PROJECT_NAME} ${SDL2_LIBS})

Decoding audio using ffmpeg

The playback part uses SDL2, but the process of decoding audio files to obtain data still uses ffmpeg. Here we use ffmpeg to decode the audio, and then pass the decoded audio data to SDL2 for playback. The decoding process is roughly the same as the previous decoding video, except that currently we are processing the audio stream instead of the video stream.

The preparation work is as follows, including introducing header files, initializing SDL, opening audio files, and configuring the decoder.

#include <iostream>
 #include <SDL2/>
 #include <queue>
 extern "C"
 {
 #include <libavformat/>
 #include <libavcodec/>
 #include <libavutil/>
 #include <libswscale/>
 #include <libavutil/>
 #include <libswresample/>
 }
 #define SDL_AUDIO_BUFFER_SIZE 2048
 queue<AVPacket> audioPackets; //Audio packet queue

 /*
  * Program entry point (setup part): initializes SDL, opens the media file,
  * locates the first audio stream and opens a decoder for it.
  * Returns -1 as soon as any step fails.
  */
 int main()
 {
     // NOTE(review): hard-coded sample path; replace it with your own media file.
     const string filename = "/home/ruby/Desktop/study/qtStudy/myPlayer/mad.mp4";
     AVFormatContext *formatCtx = nullptr;
     AVCodecContext *aCodecCtx = NULL;
     AVCodec *aCodec = NULL;
     int audioStream;

     //Initialize SDL (SDL_Init returns 0 on success)
     if (SDL_Init(SDL_INIT_AUDIO | SDL_INIT_TIMER))
     {
         cout << "SDL_Init failed: " << SDL_GetError() << endl;
         return -1;
     }

     //Open audio file
     if (avformat_open_input(&formatCtx, filename.c_str(), nullptr, nullptr) != 0)
     {
         cout << "Unable to open audio file" << endl;
         return -1;
     }

     // Get stream information
     if (avformat_find_stream_info(formatCtx, nullptr) < 0)
     {
         cout << "Unable to obtain stream information" << endl;
         return -1;
     }

     // Find the first audio stream
     audioStream = -1;
     for (unsigned int i = 0; i < formatCtx->nb_streams; i++)
     {
         if (formatCtx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
         {
             audioStream = i;
             break;
         }
     }
     if (audioStream == -1)
     {
         cout << "Audio stream not found" << endl;
         return -1;
     }

     // Get the decoder matching the stream's codec id
     aCodec = avcodec_find_decoder(formatCtx->streams[audioStream]->codecpar->codec_id);
     if (!aCodec)
     {
         cout << "Decoder not found" << endl;
         return -1;
     }

     //Configure audio parameters
     aCodecCtx = avcodec_alloc_context3(aCodec);
     if (!aCodecCtx)
     {
         // Allocation failure would otherwise be dereferenced on the next line.
         cout << "Unable to allocate decoder context" << endl;
         return -1;
     }
     if (avcodec_parameters_to_context(aCodecCtx, formatCtx->streams[audioStream]->codecpar) < 0)
     {
         cout << "Unable to copy decoder parameters" << endl;
         return -1;
     }
     aCodecCtx->pkt_timebase = formatCtx->streams[audioStream]->time_base;
     //Open the decoder
     if (avcodec_open2(aCodecCtx, aCodec, NULL) < 0)
     {
         cout << "Unable to open decoder" << endl;
         return -1;
     }

Next, we initialize the resampler. The resampler is used to convert the decoded audio data into the format we need. Here we convert the audio data into a format supported by SDL2.

SwrContext *swrCtx = swr_alloc(); // Apply for resampler memory
     /*Set related parameters*/
     av_opt_set_int(swrCtx, "in_channel_layout", aCodecCtx->channel_layout, 0);
     av_opt_set_int(swrCtx, "out_channel_layout", aCodecCtx->channel_layout, 0);
     av_opt_set_int(swrCtx, "in_sample_rate", aCodecCtx->sample_rate, 0);
     av_opt_set_int(swrCtx, "out_sample_rate", aCodecCtx->sample_rate, 0);
     av_opt_set_sample_fmt(swrCtx, "in_sample_fmt", aCodecCtx->sample_fmt, 0);
     av_opt_set_sample_fmt(swrCtx, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
     swr_init(swrCtx); //Initialize the resampler

     AVPacket packet;

This concludes the configuration of decoding audio. The next step is to configure SDL2 playback related things.

/*Configure SDL audio*/
     SDL_AudioSpec wanted_spec;
     /*Configuration parameters*/
     wanted_spec.freq = aCodecCtx->sample_rate; // Sample rate for playing audio
     wanted_spec.format = AUDIO_S16; // Playback format, s16 means 16 bits
     wanted_spec.channels = aCodecCtx->channels;//The number of playback channels, that is, the number of audio channels
     wanted_spec.silence = 0;` // Silent filling data
     wanted_spec.samples = SDL_AUDIO_BUFFER_SIZE;//Callback function buffer size
     wanted_spec.callback = audio_callback; // The callback function entry is the function name
     wanted_spec.userdata = aCodecCtx; // User data, which is void pointer type

     /*Open*/
     SDL_OpenAudio(&wanted_spec, &spec)
     /*Start playing the audio, please note that after this sentence, the pull mode is enabled and the playback begins*/
     SDL_PauseAudio(0);

Here we look specifically at the callback buffer size. The larger the buffer, the more data each callback delivers, so the callback is invoked less often and the interval between calls grows. For long-running playback, this reduces the interruptions and stutter caused by the buffer running out of data. However, if the interval between callbacks becomes too long, the audio playback latency becomes too large.

Of course, this excessive delay is for short-duration and high-frequency playback scenarios such as voice chat. In this case, the buffer can be reduced to reduce the delay. But for audio of known duration and played for a long time, the buffer can be appropriately increased to reduce playback interruptions.

Write the callback function below

int audio_buf_index = 0;
 int audio_buf_size = 0;
 void audio_callback(void *userdata, Uint8 *stream, int len)
 {
     AVCodecContext *aCodecCtx = (AVCodecContext *)userdata; // Convert user data to AVCodecContext class pointer for use
     int len1; // current data length
     int audio_size; //Decoded data length
     while (len > 0)
     {
         if (audio_buf_index >= audio_buf_size)
         {
             audio_size = audio_decode_frame(aCodecCtx, audio_buf, sizeof(audio_buf)); // Decode a frame of data
             if (audio_size < 0)
             {
                 audio_buf_size = 1024;
                 memset(audio_buf, 0, audio_buf_size);
             }
             else
             {
                 audio_buf_size = audio_size;
             }
             audio_buf_index = 0;
         }
         len1 = audio_buf_size - audio_buf_index;
         if (len1 > len)
             len1 = len;
         memcpy(stream, (uint8_t *)audio_buf + audio_buf_index, len1);
         len -= len1;
         stream += len1;
         audio_buf_index += len1;
     }
 }

The execution logic is as follows

/*
  * Callback process
  *
  * audio_buf_size is the size in bytes of the most recently decoded audio data.
  * audio_buf_index is the offset within that data of the first byte that has not yet been copied to the stream.
  * When audio_buf_index < audio_buf_size, data from the previous decode is still left over, and that remaining data is used first.
  * Otherwise new audio data needs to be decoded.
  * Also note that audio_buf_size and audio_buf_index are global variables, so their state is preserved between callback invocations.
  * When audio_decode_frame() reports failure (a negative return value, i.e. decoding failed or no more data is available), silence data is filled in instead.
  *
  * Each callback writes len bytes to the stream, and len < audio_buf_size - audio_buf_index may occur.
  * In that case the decoded data is not used up in this call; the remainder stays in audio_buf and is consumed by the next callback.
  */

Then implement audio_decode_frame()

int audio_decode_frame(AVCodecContext *aCodecCtx, uint8_t *audio_buf, int buf_size)
 {
     /*
      * Decodes the next audio frame, converts it to packed signed 16-bit samples
      * and copies the result into audio_buf.
      *
      * aCodecCtx - opened audio decoder context
      * audio_buf - destination buffer
      * buf_size  - capacity of audio_buf in bytes; output is truncated to fit
      *
      * Returns the number of bytes written, or -1 when no data is available
      * (packet queue empty, decoder fully drained) or an error occurred.
      *
      * NOTE(review): audioPackets is presumably filled by a different thread than
      * the SDL audio thread that calls this function; std::queue is not
      * thread-safe, so confirm the producer holds a lock (e.g. SDL_LockAudio).
      */
     static AVPacket *pkt = av_packet_alloc();
     static AVFrame *frame = av_frame_alloc();
     int ret;

     if (!pkt || !frame)
     {
         return -1;
     }

     // Drain frames the decoder already holds before feeding it more input:
     // one packet can yield several frames, and sending while output is pending
     // fails with EAGAIN and would lose the packet.
     for (;;)
     {
         ret = avcodec_receive_frame(aCodecCtx, frame);
         if (ret == 0)
         {
             break; // Got a frame
         }
         if (ret == AVERROR_EOF)
         {
             //The decoder has been flushed and has no more data
             return -1;
         }
         if (ret != AVERROR(EAGAIN))
         {
             // other errors
             cout << "An error occurred while decoding" << endl;
             return -1;
         }

         // EAGAIN: the decoder needs another packet. With nothing queued we must
         // not send a blank packet - that would switch the decoder into drain
         // mode permanently - so report "no data" and let the caller play silence.
         if (audioPackets.empty())
         {
             return -1;
         }
         *pkt = audioPackets.front(); // Takes over the queued packet's buffers
         audioPackets.pop();

         //Send packet to decoder
         ret = avcodec_send_packet(aCodecCtx, pkt);
         av_packet_unref(pkt);
         if (ret < 0)
         {
             cout << "Failed to send packet to decoder" << endl;
             return -1;
         }
     }

     // Upper bound on output samples: buffered resampler delay plus this frame.
     // Input and output rates are equal here, so the rescale only rounds up.
     int out_samples = (int)av_rescale_rnd(
         swr_get_delay(swr_ctx, aCodecCtx->sample_rate) + frame->nb_samples,
         frame->sample_rate, // Output sampling rate
         frame->sample_rate, //Input sampling rate
         AV_ROUND_UP);

     int out_buffer_size = av_samples_get_buffer_size(
         NULL,
         aCodecCtx->channels,
         out_samples,
         AV_SAMPLE_FMT_S16,
         1);
     if (out_buffer_size < 0)
     {
         cout << "Resampling conversion error" << endl;
         return -1;
     }

     // Grow the intermediate conversion buffer when this frame needs more room.
     if (out_buffer_size > audio_convert_buf_size)
     {
         av_free(audio_convert_buf);
         audio_convert_buf = (uint8_t *)av_malloc(out_buffer_size);
         if (!audio_convert_buf)
         {
             audio_convert_buf_size = 0;
             return -1;
         }
         audio_convert_buf_size = out_buffer_size;
     }

     //Perform resampling
     ret = swr_convert(
         swr_ctx,
         &audio_convert_buf,
         out_samples,
         (const uint8_t **)frame->data,
         frame->nb_samples);
     if (ret < 0)
     {
         cout << "Resampling conversion error" << endl;
         return -1;
     }

     // Packed S16: 2 bytes per sample per channel.
     int bytes_per_frame = frame->channels * 2;
     if (bytes_per_frame <= 0)
     {
         return -1;
     }
     int data_size = ret * bytes_per_frame;
     if (data_size > buf_size)
     {
         // Never write past the caller's buffer; keep whole sample frames only.
         data_size = buf_size - (buf_size % bytes_per_frame);
     }
     memcpy(audio_buf, audio_convert_buf, data_size);
     return data_size;
 }