Audio timestamp settings
The following code is analyzed based on FFmpeg n5.1.2
The specific timestamp data about the audio in the following documents comes from the following transcoding commands:
./ffmpeg_g -rw_timeout 5000000 -i 'rtmp:///live/test' -acodec libfdk_aac -b:a 64k -ac 2 -ar 48000 -profile:a aac_low -vcodec libx264 -b:v 2000k -level 3.1 -vprofile high -strict -2 -preset medium -bf 3 -f flv -loglevel level+info -vf "scale='720:-2'" 'rtmp:///live/dest'
It's possible that callstack's lines of code don't match the source code because some logging was added manually.
Set st->start_time
There are two AVStream start_times that need to be set, the audio one and the video one
Call stack.
#0 update_initial_timestamps (s=0x2024dc0, stream_index=1, dts=<optimized out>, pts=579486901, pkt=<optimized out>) at libavformat/:899
#1 0x0000000000697e74 in compute_pkt_fields (s=s@entry=0x2024dc0, st=st@entry=0x2029340, pc=pc@entry=0x0, pkt=pkt@entry=0x20250c0, next_dts=next_dts@entry=-9223372036854775808, next_pts=next_pts@entry=-9223372036854775808) at libavformat/:1124
#2 0x0000000000699274 in read_frame_internal (s=s@entry=0x2024dc0, pkt=pkt@entry=0x20250c0) at libavformat/:1371
#3 0x000000000069a723 in avformat_find_stream_info (ic=0x2024dc0, options=0x0) at libavformat/:2663
#4 0x00000000004a3abf in open_input_file (o=o@entry=0x7fffffffd098, filename=0x7fffffffe48c "rtmp:///alicdn/jingtian") at fftools/ ffmpeg_opt.c:1286
#5 0x000000000049fd24 in open_files (l=0x2024718, inout=inout@entry=0x15d599d "input", open_file=open_file@entry=0x4a346c <open_input_file>) at fftools/ ffmpeg_opt.c:3542
#6 0x00000000004a9f67 in ffmpeg_parse_options (argc=argc@entry=38, argv=argv@entry=0x7fffffffe028) at fftools/ffmpeg_opt.c:3582
#7 0x0000000000499311 in main (argc=38, argv=0x7fffffffe028) at fftools/:4683
Code:
/* Excerpt from libavformat (FFmpeg n5.1.2): sets st->start_time the first time
 * a usable timestamp is seen for the stream. */
static void update_initial_timestamps(AVFormatContext *s, int stream_index,
int64_t dts, int64_t pts, AVPacket *pkt)
{
...
if (has_decode_delay_been_guessed(st))
update_dts_from_pts(s, stream_index, pktl);
if (st->start_time == AV_NOPTS_VALUE) {
if (st->codecpar->codec_type == AVMEDIA_TYPE_AUDIO || !(pkt->flags & AV_PKT_FLAG_DISCARD)) {
st->start_time = pts; // takes the pts of the first frame (for audio, the first audio frame)
}
if (st->codecpar->codec_type == AVMEDIA_TYPE_AUDIO && st->codecpar->sample_rate)
st->start_time = av_sat_add64(st->start_time, av_rescale_q(sti->skip_samples, (AVRational){1, st->codecpar->sample_rate}, st->time_base)); // this branch is also taken, but sti->skip_samples is 0 here, so the final value is unchanged
}
}
Calculate ic->start_time
The next step is to use the st->start_time from the previous step to calculate the ic->start_time. The audio and video have their own st->start_time saved separately, and the smaller of the two values will be taken when calculating the ic->start_time.
Note: The function update_stream_timings is called twice, but the final ic->start_time result is the same.
Call stack.
#0 update_stream_timings (ic=ic@entry=0x2024dc0) at libavformat/:1619
#1 0x00000000006965cf in fill_all_stream_timings (ic=ic@entry=0x2024dc0) at libavformat/:1701
#2 0x000000000069bbd2 in estimate_timings (old_offset=13, ic=0x2024dc0) at libavformat/:1945
#3 avformat_find_stream_info (ic=0x2024dc0, options=<optimized out>) at libavformat/:2951
#4 0x00000000004a3abf in open_input_file (o=o@entry=0x7fffffffd098, filename=0x7fffffffe48c "rtmp:///alicdn/jingtian") at fftools/ ffmpeg_opt.c:1286
#5 0x000000000049fd24 in open_files (l=0x2024718, inout=inout@entry=0x15d58dd "input", open_file=open_file@entry=0x4a346c <open_input_file>) at fftools/ ffmpeg_opt.c:3542
#6 0x00000000004a9f67 in ffmpeg_parse_options (argc=argc@entry=38, argv=argv@entry=0x7fffffffe028) at fftools/ffmpeg_opt.c:3582
#7 0x0000000000499311 in main (argc=38, argv=0x7fffffffe028) at fftools/:4683
Code:
/* Excerpt from libavformat (FFmpeg n5.1.2): derives ic->start_time (and
 * end_time/duration) from the per-stream start_time values. */
static void update_stream_timings(AVFormatContext *ic)
{
...
for (unsigned i = 0; i < ic->nb_streams; i++) { // iterates over every AVStream, i.e. both the audio and the video stream
AVStream *const st = ic->streams[i];
int is_text = st->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE ||
st->codecpar->codec_type == AVMEDIA_TYPE_DATA;
if (st->start_time != AV_NOPTS_VALUE && st->time_base.den) {
start_time1 = av_rescale_q(st->start_time, st->time_base,
AV_TIME_BASE_Q); // rescales this AVStream's start_time to AV_TIME_BASE_Q, i.e. from 1/1000 to 1/1000000, so the value grows by a factor of 1000
if (is_text)
start_time_text = FFMIN(start_time_text, start_time1);
else
start_time = FFMIN(start_time, start_time1); // keeps the smaller of the two, i.e. the earlier of the audio/video first-frame times
end_time1 = av_rescale_q_rnd(st->duration, st->time_base,
AV_TIME_BASE_Q,
AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
if (end_time1 != AV_NOPTS_VALUE && (end_time1 > 0 ? start_time1 <= INT64_MAX - end_time1 : start_time1 >= INT64_MIN - end_time1)) {
end_time1 += start_time1;
if (is_text)
end_time_text = FFMAX(end_time_text, end_time1);
else
end_time = FFMAX(end_time, end_time1);
}
for (AVProgram *p = NULL; (p = av_find_program_from_stream(ic, p, i)); ) {
if (p->start_time == AV_NOPTS_VALUE || p->start_time > start_time1)
p->start_time = start_time1;
if (p->end_time < end_time1)
p->end_time = end_time1;
}
}
if (st->duration != AV_NOPTS_VALUE) {
duration1 = av_rescale_q(st->duration, st->time_base,
AV_TIME_BASE_Q);
if (is_text)
duration_text = FFMAX(duration_text, duration1);
else
duration = FFMAX(duration, duration1);
}
}
...
if (start_time != INT64_MAX) {
ic->start_time = start_time; // assigns the result to ic->start_time
if (end_time != INT64_MIN) {
if (ic->nb_programs > 1) {
for (unsigned i = 0; i < ic->nb_programs; i++) {
AVProgram *const p = ic->programs[i];
if (p->start_time != AV_NOPTS_VALUE &&
p->end_time > p->start_time &&
p->end_time - (uint64_t)p->start_time <= INT64_MAX)
duration = FFMAX(duration, p->end_time - p->start_time);
}
} else if (end_time >= start_time && end_time - (uint64_t)start_time <= INT64_MAX) {
duration = FFMAX(duration, end_time - start_time);
}
}
}
}
Set f->ts_offset
After setting ic->start_time in the previous step, this value will be assigned to f->ts_offset, which will be used next to adjust the timestamps of each audio and video frame.
After setting the ic->start_time by calling avformat_find_stream_info in open_input_file, set f->ts_offset in the logic below this function:.
/* Excerpt from fftools/ffmpeg_opt.c: derives f->ts_offset from ic->start_time. */
static int open_input_file(OptionsContext *o, const char *filename)
{
...
timestamp = (o->start_time == AV_NOPTS_VALUE) ? 0 : o->start_time;
/* add the stream start time */
if (!o->seek_timestamp && ic->start_time != AV_NOPTS_VALUE)
timestamp += ic->start_time; // timestamp becomes ic->start_time (o->start_time is 0 here)
f->ts_offset = o->input_ts_offset - (copy_ts ? (start_at_zero && ic->start_time != AV_NOPTS_VALUE ? ic->start_time : 0) : timestamp); // here this evaluates to 0 - timestamp, so f->ts_offset is set to the negative of ic->start_time
}
Re-write the timestamp of the packet
As each audio frame is read, its timestamps are readjusted so that the first frame's adjusted timestamp starts from 0: f->ts_offset (a negative value) is added to each AVPacket's pts/dts after a time-base conversion, which effectively subtracts the first frame's timestamp.
static int process_input(int file_index)
{
...
if (pkt->dts ! = AV_NOPTS_VALUE)
pkt->dts += av_rescale_q(ifile->ts_offset, AV_TIME_BASE_Q, ist->st->time_base); //Transform the time base of ts_offset back to 1/1000 from 1/1000000, i.e., the first frame of the audio The relative number of timestamps, calculated so that each frame's time is reduced by the timestamp of the first frame.
if (pkt->pts ! = AV_NOPTS_VALUE)
pkt->pts += av_rescale_q(ifile->ts_offset, AV_TIME_BASE_Q, ist->st->time_base);
...
}
Call stack.
#0 process_input (file_index=0) at fftools/:4071
#1 transcode_step () at fftools/:4456
#2 transcode () at fftools/:4510
#3 main (argc=<optimized out>, argv=<optimized out>) at fftools/:4705
Pass the timestamp of the AVPacket to the AVFrame
After unpacking and adjusting the timestamps, the next step is to decode the audio frames. After unpacking, the timestamps of the AVPacket are assigned directly to the AVFrame.
/* Excerpt from libavcodec: copies packet-level properties (timestamps and
 * side data) onto the decoded AVFrame. */
int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
{
AVPacket *pkt = avctx->internal->last_pkt_props;
static const struct {
enum AVPacketSideDataType packet;
enum AVFrameSideDataType frame;
} sd[] = {
{ AV_PKT_DATA_REPLAYGAIN , AV_FRAME_DATA_REPLAYGAIN },
{ AV_PKT_DATA_DISPLAYMATRIX, AV_FRAME_DATA_DISPLAYMATRIX },
{ AV_PKT_DATA_SPHERICAL, AV_FRAME_DATA_SPHERICAL },
{ AV_PKT_DATA_STEREO3D, AV_FRAME_DATA_STEREO3D },
{ AV_PKT_DATA_AUDIO_SERVICE_TYPE, AV_FRAME_DATA_AUDIO_SERVICE_TYPE },
{ AV_PKT_DATA_MASTERING_DISPLAY_METADATA, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA },
{ AV_PKT_DATA_CONTENT_LIGHT_LEVEL, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL },
{ AV_PKT_DATA_A53_CC, AV_FRAME_DATA_A53_CC },
{ AV_PKT_DATA_ICC_PROFILE, AV_FRAME_DATA_ICC_PROFILE },
{ AV_PKT_DATA_S12M_TIMECODE, AV_FRAME_DATA_S12M_TIMECODE },
{ AV_PKT_DATA_DYNAMIC_HDR10_PLUS, AV_FRAME_DATA_DYNAMIC_HDR_PLUS },
};
if (!(ffcodec(avctx->codec)->caps_internal & FF_CODEC_CAP_SETS_FRAME_PROPS)) {
frame->pts = pkt->pts; // here the AVPacket's timestamp is assigned to the AVFrame
frame->pkt_pos = pkt->pos;
frame->pkt_duration = pkt->duration;
frame->pkt_size = pkt->size;
...
}
Call stack.
#0 ff_decode_frame_props (avctx=avctx@entry=0x203a4c0, frame=frame@entry=0x2ea2800) at libavcodec/:1292
#1 0x00000000007fbc73 in ff_get_buffer (avctx=avctx@entry=0x203a4c0, frame=0x2ea2800, flags=flags@entry=0) at libavcodec/:1468
#2 0x0000000000b879fd in frame_configure_elements (avctx=avctx@entry=0x203a4c0) at libavcodec/aacdec_template.c:184
#3 0x0000000000b8bdc9 in aac_decode_frame_int (avctx=avctx@entry=0x203a4c0, frame=frame@entry=0x2ea2800, got_frame_ptr=got_frame_ptr@entry=0x7fffffffd3f4, gb=gb@entry=0x7fffffffd340, avpkt=0x2ea2b40) at libavcodec/aacdec_template.c:3271
#4 0x0000000000b8ce0f in aac_decode_frame (avctx=0x203a4c0, frame=0x2ea2800, got_frame_ptr=0x7fffffffd3f4, avpkt=0x2ea2b40) at libavcodec/aacdec_template.c:3513
#5 0x00000000007fa202 in decode_simple_internal (discarded_samples=<synthetic pointer>, frame=0x2ea2800, avctx=0x203a4c0) at libavcodec/:317
#6 decode_simple_receive_frame (frame=<optimized out>, avctx=<optimized out>) at libavcodec/:526
#7 decode_receive_frame_internal (avctx=avctx@entry=0x203a4c0, frame=0x2ea2800) at libavcodec/:550
#8 0x00000000007faa55 in avcodec_send_packet (avctx=avctx@entry=0x203a4c0, avpkt=avpkt@entry=0x20465c0) at libavcodec/:620
#9 0x00000000004af5a9 in decode (avctx=0x203a4c0, frame=0x206a200, got_frame=0x7fffffffd52c, pkt=0x20465c0) at fftools/:2116
#10 0x00000000004b383b in decode_audio (decode_failed=<synthetic pointer>, got_output=0x7fffffffd52c, pkt=0x20465c0, ist=0x203a2c0) at fftools/:2160
#11 process_input_packet (ist=ist@entry=0x203a2c0, pkt=0x2134c40, no_eof=no_eof@entry=0) at fftools/:2488
#12 0x000000000049b4db in process_input (file_index=<optimized out>) at fftools/:4310
#13 transcode_step () at fftools/:4457
#14 transcode () at fftools/:4511
#15 main (argc=<optimized out>, argv=<optimized out>) at fftools/:4706
Adjustment of timestamps in AVFrame
After the audio packet is decoded into an AVFrame, the timestamp set in the previous step still needs to be adjusted: the time base must be converted from 1/1000 to 1/sample_rate. This conversion is not a direct rescale, because with some sample rates the interval between frames cannot be kept strictly equal — for example at 44,100 Hz, a few 23ms frames are usually followed by one 24ms frame to compensate. The two cases are handled separately:
- If duration is 23ms, the timestamp calculation will use the value filter_in_rescale_delta_last, which is the timestamp value of the current frame estimated when calculating the timestamp of the previous frame.
- If duration is not 23ms, directly perform a time base conversion (rounding up) on the timestamp obtained in the previous step.
/* NOTE(review): despite the function name shown here, this code (sample_rate,
 * av_rescale_delta, filter_in_rescale_delta_last) matches the audio path
 * (decode_audio in fftools/ffmpeg.c), and the call stack in this document
 * points at decode_audio — confirm against the n5.1.2 sources. */
static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_t *duration_pts, int eof,
int *decode_failed)
{
if (pkt && pkt->duration && ist->prev_pkt_pts != AV_NOPTS_VALUE &&
pkt->pts != AV_NOPTS_VALUE && pkt->pts - ist->prev_pkt_pts > pkt->duration)
ist->filter_in_rescale_delta_last = AV_NOPTS_VALUE; // if the gap between neighbouring frames exceeds the duration, discard the cached value — this is the 24ms-frame case described above
if (pkt)
ist->prev_pkt_pts = pkt->pts;
if (decoded_frame->pts != AV_NOPTS_VALUE)
decoded_frame->pts = av_rescale_delta(decoded_frame_tb, decoded_frame->pts,
(AVRational){1, avctx->sample_rate}, decoded_frame->nb_samples, &ist->filter_in_rescale_delta_last,
(AVRational){1, avctx->sample_rate}); // computes the final timestamp
}
/* From libavutil/mathematics.c (syntax restored — the quoted text had been
 * garbled by extraction: ". " for ";", "simple_round." for the label, "! =",
 * "& &", and a duplicated assert).
 * Rescales in_ts from in_tb to out_tb while keeping sample-accurate continuity
 * via *last, which carries the predicted timestamp of the next frame in fs_tb
 * (here 1/sample_rate) units. */
int64_t av_rescale_delta(AVRational in_tb, int64_t in_ts, AVRational fs_tb, int duration, int64_t *last, AVRational out_tb){
    int64_t a, b, this;

    av_assert0(in_ts != AV_NOPTS_VALUE);
    av_assert0(duration >= 0);

    if (*last == AV_NOPTS_VALUE || !duration || in_tb.num*(int64_t)out_tb.den <= out_tb.num*(int64_t)in_tb.den) {
simple_round:
        // When the cached value is ignored, a fresh *last is still produced:
        // current timestamp + duration, i.e. the estimated next-frame timestamp.
        *last = av_rescale_q(in_ts, in_tb, fs_tb) + duration;
        return av_rescale_q(in_ts, in_tb, out_tb);
    }

    // Otherwise compute two boundary values bracketing the true timestamp.
    a =  av_rescale_q_rnd(2*in_ts-1, in_tb, fs_tb, AV_ROUND_DOWN) >>1;
    b = (av_rescale_q_rnd(2*in_ts+1, in_tb, fs_tb, AV_ROUND_UP  )+1)>>1;
    if (*last < 2*a - b || *last > 2*b - a)
        goto simple_round;

    // Use the cached value if it lies within the bounds, else clip to them.
    this = av_clip64(*last, a, b);
    // Predict the next frame's timestamp.
    *last = this + duration;

    // Final time-base conversion.
    return av_rescale_q(this, fs_tb, out_tb);
}
Call stack.
#0 decode_audio (decode_failed=<synthetic pointer>, got_output=0x7fffffffd52c, pkt=0x209a680, ist=0x2028c40) at fftools/:2207
#1 process_input_packet (ist=ist@entry=0x2028c40, pkt=0x209a440, no_eof=no_eof@entry=0) at fftools/:2488
#2 0x000000000049b4db in process_input (file_index=<optimized out>) at fftools/:4310
#3 transcode_step () at fftools/:4457
#4 transcode () at fftools/:4511
#5 main (argc=<optimized out>, argv=<optimized out>) at fftools/:4706
Setting timestamps for resampled audio frames
After the decoding is completed, the AVFrame will be sent to the filter for resampling, and the delay caused by it will be taken into account in setting the timestamp after resampling, and there will be two kinds of time compensation when calculating the timestamp:
- No automatic timestamp compensation (min_compensation >= FLT_MAX): in this case, the timestamp will be passed and compensated by a delay.
- Use automatic timestamp compensation (min_compensation < FLT_MAX): in this case, the output timestamp will match the output sample number.
Automatic time compensation, which is compensation by calculating the delay and subtracting it, was not used in the tests.
The whole calculation proceeds as follows: first convert the audio frame's timestamp to a time base of 1/(input sample rate × output sample rate), then pass it to swr_next_pts to compute the output timestamp, and finally convert the time base to 1/output sample rate. (The principle of the delay algorithm needs further research.)
/* Excerpt from libavfilter/af_aresample.c: resamples one input frame and
 * computes the output frame's timestamp via swr_next_pts(). */
static int filter_frame(AVFilterLink *inlink, AVFrame *insamplesref)
{
AResampleContext *aresample = inlink->dst->priv;
const int n_in = insamplesref->nb_samples;
int64_t delay;
int n_out = n_in * aresample->ratio + 32;
AVFilterLink *const outlink = inlink->dst->outputs[0];
AVFrame *outsamplesref;
int ret;
delay = swr_get_delay(aresample->swr, outlink->sample_rate);
if (delay > 0)
n_out += FFMIN(delay, FFMAX(4096, n_out));
outsamplesref = ff_get_audio_buffer(outlink, n_out);
if(!outsamplesref) {
av_frame_free(&insamplesref);
return AVERROR(ENOMEM);
}
av_frame_copy_props(outsamplesref, insamplesref);
outsamplesref->format = outlink->format;
#if FF_API_OLD_CHANNEL_LAYOUT
FF_DISABLE_DEPRECATION_WARNINGS
outsamplesref->channels = outlink->ch_layout.nb_channels;
outsamplesref->channel_layout = outlink->channel_layout;
FF_ENABLE_DEPRECATION_WARNINGS
#endif
ret = av_channel_layout_copy(&outsamplesref->ch_layout, &outlink->ch_layout);
if (ret < 0)
return ret;
outsamplesref->sample_rate = outlink->sample_rate;
if(insamplesref->pts != AV_NOPTS_VALUE) {
int64_t inpts = av_rescale(insamplesref->pts, inlink->time_base.num * (int64_t)outlink->sample_rate * inlink->sample_rate, inlink->time_base.den); // converts the time base to 1/(input_sample_rate * output_sample_rate)
int64_t outpts= swr_next_pts(aresample->swr, inpts); // computes the output timestamp
aresample->next_pts =
outsamplesref->pts = ROUNDED_DIV(outpts, inlink->sample_rate); // finally converts the time base to 1/output_sample_rate
} else {
outsamplesref->pts = AV_NOPTS_VALUE;
}
n_out = swr_convert(aresample->swr, outsamplesref->extended_data, n_out,
(void *)insamplesref->extended_data, n_in);
if (n_out <= 0) {
av_frame_free(&outsamplesref);
av_frame_free(&insamplesref);
return 0;
}
aresample->more_data = outsamplesref->nb_samples == n_out; // Indicate that there is probably more data in our buffers
outsamplesref->nb_samples = n_out;
ret = ff_filter_frame(outlink, outsamplesref);
av_frame_free(&insamplesref);
return ret;
}
/* From libswresample: computes the next output timestamp, optionally applying
 * automatic timestamp compensation (silence injection / sample dropping /
 * soft compensation). */
int64_t swr_next_pts(struct SwrContext *s, int64_t pts){
if(pts == INT64_MIN)
return s->outpts;
if (s->firstpts == AV_NOPTS_VALUE)
s->outpts = s->firstpts = pts;
// No automatic timestamp compensation: just compute the delay and subtract it.
if(s->min_compensation >= FLT_MAX) {
return (s->outpts = pts - swr_get_delay(s, s->in_sample_rate * (int64_t)s->out_sample_rate));
} else { // automatic timestamp compensation in use
int64_t delta = pts - swr_get_delay(s, s->in_sample_rate * (int64_t)s->out_sample_rate) - s->outpts + s->drop_output*(int64_t)s->in_sample_rate;
double fdelta = delta /(double)(s->in_sample_rate * (int64_t)s->out_sample_rate);
if(fabs(fdelta) > s->min_compensation) {
if(s->outpts == s->firstpts || fabs(fdelta) > s->min_hard_compensation){
int ret;
if(delta > 0) ret = swr_inject_silence(s, delta / s->out_sample_rate);
else ret = swr_drop_output (s, -delta / s-> in_sample_rate);
if(ret<0){
av_log(s, AV_LOG_ERROR, "Failed to compensate for timestamp delta of %f\n", fdelta);
}
} else if(s->soft_compensation_duration && s->max_soft_compensation) {
int duration = s->out_sample_rate * s->soft_compensation_duration;
double max_soft_compensation = s->max_soft_compensation / (s->max_soft_compensation < 0 ? -s->in_sample_rate : 1);
int comp = av_clipf(fdelta, -max_soft_compensation, max_soft_compensation) * duration ;
av_log(s, AV_LOG_VERBOSE, "compensating audio timestamp drift:%f compensation:%d in:%d\n", fdelta, comp, duration);
swr_set_compensation(s, comp, duration);
}
}
return s->outpts;
}
}
Call stack.
#0 filter_frame (inlink=inlink@entry=0x20a2f80, insamplesref=0x20a2cc0) at libavfilter/af_aresample.c:209
#1 0x00000000004ce109 in ff_filter_frame_framed (frame=0x20a2cc0, link=0x20a2f80) at libavfilter/:990
#2 ff_filter_frame_to_filter (link=0x20a2f80) at libavfilter/:1138
#3 ff_filter_activate_default (filter=<optimized out>) at libavfilter/:1187
#4 ff_filter_activate (filter=<optimized out>) at libavfilter/:1345
#5 0x00000000004d01a2 in ff_filter_graph_run_once (graph=graph@entry=0x209e900) at libavfilter/:1351
#6 0x00000000004d1362 in push_frame (graph=0x209e900) at libavfilter/:169
#7 av_buffersrc_add_frame_flags (ctx=0x20a1d40, frame=frame@entry=0x203d600, flags=flags@entry=4) at libavfilter/:252
#8 0x00000000004b35a5 in ifilter_send_frame (keep_reference=<optimized out>, frame=0x203d600, ifilter=0x206a080) at fftools/:2068
#9 send_frame_to_filters (ist=ist@entry=0x2150a40, decoded_frame=decoded_frame@entry=0x203d600) at fftools/:2138
#10 0x00000000004b4c63 in decode_audio (decode_failed=<synthetic pointer>, got_output=0x7fffffffd52c, pkt=0x203d880, ist=0x2150a40) at fftools/ :2209
#11 process_input_packet (ist=ist@entry=0x2150a40, pkt=0x215a3c0, no_eof=no_eof@entry=0) at fftools/:2488
#12 0x000000000049b4db in process_input (file_index=<optimized out>) at fftools/:4310
#13 transcode_step () at fftools/:4457
#14 transcode () at fftools/:4511
#15 main (argc=<optimized out>, argv=<optimized out>) at fftools/:4706
After resampling is complete, the processed audio frames are placed inside the queue:
/* From libavfilter: validates a frame against the link's negotiated format
 * and places it into the link's FIFO queue. */
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
{
int ret;
FF_TPRINTF_START(NULL, filter_frame); ff_tlog_link(NULL, link, 1); ff_tlog(NULL, " "); tlog_ref(NULL, frame, 1);
/* Consistency checks */
if (link->type == AVMEDIA_TYPE_VIDEO) {
if (strcmp(link->dst->filter->name, "buffersink") &&
strcmp(link->dst->filter->name, "format") &&
strcmp(link->dst->filter->name, "idet") &&
strcmp(link->dst->filter->name, "null") &&
strcmp(link->dst->filter->name, "scale")) {
av_assert1(frame->format == link->format);
av_assert1(frame->width == link->w);
av_assert1(frame->height == link->h);
}
} else {
if (frame->format != link->format) {
av_log(link->dst, AV_LOG_ERROR, "Format change is not supported\n");
goto error;
}
if (av_channel_layout_compare(&frame->ch_layout, &link->ch_layout)) {
av_log(link->dst, AV_LOG_ERROR, "Channel layout change is not supported\n");
goto error;
}
if (frame->sample_rate != link->sample_rate) {
av_log(link->dst, AV_LOG_ERROR, "Sample rate change is not supported\n");
goto error;
}
}
link->frame_blocked_in = link->frame_wanted_out = 0;
link->frame_count_in++;
link->sample_count_in += frame->nb_samples;
filter_unblock(link->dst);
ret = ff_framequeue_add(&link->fifo, frame); // enqueue the frame
if (ret < 0) {
av_frame_free(&frame);
return ret;
}
ff_filter_set_ready(link->dst, 300);
return 0;
error:
av_frame_free(&frame);
return AVERROR_PATCHWELCOME;
}
Fixed-length sample data from the audio frame queue, adjust timestamps
The next step is to take the samples out of the queue, usually 1024 samples, and when you need to split a frame to make up the 1024 samples, the timestamp of this frame needs to be adjusted:.
/* From libavfilter: extracts between min and max samples from the link's
 * frame queue, merging whole frames and splitting the last one if needed. */
static int take_samples(AVFilterLink *link, unsigned min, unsigned max,
AVFrame **rframe)
{
AVFrame *frame0, *frame, *buf;
unsigned nb_samples, nb_frames, i, p;
int ret;
/* Note: this function relies on no format changes and must only be
called with enough samples. */
av_assert1(samples_ready(link, link->min_samples));
frame0 = frame = ff_framequeue_peek(&link->fifo, 0);
if (!link->fifo.samples_skipped && frame->nb_samples >= min && frame->nb_samples <= max) {
*rframe = ff_framequeue_take(&link->fifo);
return 0;
}
nb_frames = 0;
nb_samples = 0;
while (1) {
if (nb_samples + frame->nb_samples > max) {
if (nb_samples < min)
nb_samples = max;
break;
}
nb_samples += frame->nb_samples;
nb_frames++;
if (nb_frames == ff_framequeue_queued_frames(&link->fifo))
break;
frame = ff_framequeue_peek(&link->fifo, nb_frames);
}
buf = ff_get_audio_buffer(link, nb_samples);
if (!buf)
return AVERROR(ENOMEM);
ret = av_frame_copy_props(buf, frame0);
if (ret < 0) {
av_frame_free(&buf);
return ret;
}
p = 0;
// First copy the sample data of the whole frames.
for (i = 0; i < nb_frames; i++) {
frame = ff_framequeue_take(&link->fifo);
av_samples_copy(buf->extended_data, frame->extended_data, p, 0,
frame->nb_samples, link->ch_layout.nb_channels, link->format);
p += frame->nb_samples;
av_frame_free(&frame);
}
// Not enough samples yet after the whole frames: peek the next frame, copy
// n samples from it, then call ff_framequeue_skip_samples to skip those n samples.
if (p < nb_samples) {
unsigned n = nb_samples - p;
frame = ff_framequeue_peek(&link->fifo, 0);
av_samples_copy(buf->extended_data, frame->extended_data, p, 0, n,
link->ch_layout.nb_channels, link->format);
ff_framequeue_skip_samples(&link->fifo, n, link->time_base);
}
*rframe = buf;
return 0;
}
/* From libavfilter: skips the first `samples` samples of the head frame in
 * the queue, advancing its data pointers and its timestamp accordingly. */
void ff_framequeue_skip_samples(FFFrameQueue *fq, size_t samples, AVRational time_base)
{
FFFrameBucket *b;
size_t bytes;
int planar, planes, i;
check_consistency(fq);
av_assert1(fq->queued);
b = bucket(fq, 0);
av_assert1(samples < b->frame->nb_samples);
planar = av_sample_fmt_is_planar(b->frame->format);
planes = planar ? b->frame->ch_layout.nb_channels : 1;
bytes = samples * av_get_bytes_per_sample(b->frame->format);
if (!planar)
bytes *= b->frame->ch_layout.nb_channels;
if (b->frame->pts != AV_NOPTS_VALUE)
b->frame->pts += av_rescale_q(samples, av_make_q(1, b->frame->sample_rate), time_base); // when skipping samples, the timestamp must be advanced as well
b->frame->nb_samples -= samples;
b->frame->linesize[0] -= bytes;
for (i = 0; i < planes; i++)
b->frame->extended_data[i] += bytes;
for (i = 0; i < planes && i < AV_NUM_DATA_POINTERS; i++)
b->frame->data[i] = b->frame->extended_data[i];
fq->total_samples_tail += samples;
fq->samples_skipped = 1;
ff_framequeue_update_peeked(fq, 0);
}
Call stack.
#0 take_samples (rframe=<synthetic pointer>, max=1024, min=1024, link=0x20b1300) at libavfilter/:1058
#1 ff_inlink_consume_samples (link=link@entry=0x20b1300, min=min@entry=1024, max=max@entry=1024, rframe=rframe@entry=0x7fffffffd548) at libavfilter/ :1437
#2 0x00000000004d08d5 in get_frame_internal (ctx=ctx@entry=0x20b0440, frame=frame@entry=0x2035100, flags=flags@entry=2, samples=1024) at libavfilter/ :135
#3 0x00000000004d0a2c in av_buffersink_get_frame_flags (ctx=ctx@entry=0x20b0440, frame=frame@entry=0x2035100, flags=flags@entry=2) at libavfilter/ :172
#4 0x00000000004b3134 in reap_filters (flush=flush@entry=0) at fftools/:1399
#5 0x000000000049b54b in transcode_step () at fftools/:4467
#6 transcode () at fftools/:4511
#7 main (argc=<optimized out>, argv=<optimized out>) at fftools/:4706
Encoded audio frames set timestamp
After resampling is done, do_audio_out is called to encode; a queue is used to produce the correct timestamp for each encoded frame.
Depending on the encoder, a delay parameter must be set — the encoder needs some lead-in time before it emits frames; for libfdk-aac this delay is 2048 samples. The first frame's timestamp (0, as established earlier) minus this delay gives -2048, which is added to the queue. The timestamps added afterwards are the resampled ones: 1024, 2048, 3072, 4096, ... So there is a problem: the sequence jumps from -2048 straight to 1024; two intermediate timestamps must be inserted in between to keep the timestamps increasing smoothly. This logic is implemented in ff_af_queue_remove:
/* From libavcodec/audio_frame_queue.c: removes nb_samples from the queue and
 * reports the pts/duration that belong to the encoded packet being emitted. */
void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts,
int64_t *duration)
{
int64_t out_pts = AV_NOPTS_VALUE;
int removed_samples = 0;
int i;
if (afq->frame_count || afq->frame_alloc) {
if (afq->frames->pts != AV_NOPTS_VALUE)
out_pts = afq->frames->pts;
}
if(!afq->frame_count)
av_log(afq->avctx, AV_LOG_WARNING, "Trying to remove %d samples, but the queue is empty\n", nb_samples);
if (pts)
*pts = ff_samples_to_time_base(afq->avctx, out_pts);
for(i=0; nb_samples && i<afq->frame_count; i++){
int n= FFMIN(afq->frames[i].duration, nb_samples);
afq->frames[i].duration -= n;
nb_samples -= n;
removed_samples += n;
if(afq->frames[i].pts != AV_NOPTS_VALUE)
afq->frames[i].pts += n;
}
afq->remaining_samples -= removed_samples;
i -= i && afq->frames[i-1].duration; // the inserted filler timestamps are not removed from the queue, so i ends up as 0 here
// memmove keeps the previous frame's entry in place; its timestamp is updated by the logic below
memmove(afq->frames, afq->frames + i, sizeof(*afq->frames) * (afq->frame_count - i));
afq->frame_count -= i;
if(nb_samples){
av_assert0(!afq->frame_count);
av_assert0(afq->remaining_samples == afq->remaining_delay);
if(afq->frames && afq->frames[0].pts != AV_NOPTS_VALUE)
// Update the timestamp here (first pass gives -1024, second pass gives 0).
afq->frames[0].pts += nb_samples;
av_log(afq->avctx, AV_LOG_DEBUG, "Trying to remove %d more samples than there are in the queue\n", nb_samples);
}
if (duration)
*duration = ff_samples_to_time_base(afq->avctx, removed_samples);
}
Calling the stack
#0 ff_af_queue_add (afq=afq@entry=0x2038e30, f=f@entry=0x20c85c0) at libavcodec/audio_frame_queue.c:49
#1 0x00000000008bf159 in aac_encode_frame (avctx=0x202d4c0, avpkt=0x20a7f00, frame=0x20c85c0, got_packet_ptr=0x7fffffffd2ac) at libavcodec/ :387
#2 0x0000000000820eac in encode_simple_internal (avpkt=0x20a7f00, avctx=0x202d4c0) at libavcodec/:234
#3 encode_simple_receive_packet (avpkt=<optimized out>, avctx=<optimized out>) at libavcodec/:295
#4 encode_receive_packet_internal (avctx=avctx@entry=0x202d4c0, avpkt=0x20a7f00) at libavcodec/:348
#5 0x000000000082138a in avcodec_send_frame (avctx=avctx@entry=0x202d4c0, frame=frame@entry=0x204b380) at libavcodec/:444
#6 0x00000000004af758 in encode_frame (of=of@entry=0x215e580, ost=ost@entry=0x2038ac0, frame=frame@entry=0x204b380) at fftools/:922
#7 0x00000000004b32f6 in do_audio_out (frame=0x204b380, ost=0x2038ac0, of=0x215e580) at fftools/:1038
#8 reap_filters (flush=flush@entry=0) at fftools/:1436
#9 0x000000000049b54b in transcode_step () at fftools/:4467
#10 transcode () at fftools/:4511
#11 main (argc=<optimized out>, argv=<optimized out>) at fftools/:4706
#0 ff_af_queue_remove (afq=0x21700f0, nb_samples=1024, pts=0x2075188, duration=0x20751c0) at libavcodec/audio_frame_queue.c:99
#1 0x00000000008bf2cd in aac_encode_frame (avctx=0x206cc40, avpkt=0x2075180, frame=0x209b000, got_packet_ptr=0x7fffffffd2ac) at libavcodec/ :436
#2 0x0000000000820eac in encode_simple_internal (avpkt=0x2075180, avctx=0x206cc40) at libavcodec/:234
#3 encode_simple_receive_packet (avpkt=<optimized out>, avctx=<optimized out>) at libavcodec/:295
#4 encode_receive_packet_internal (avctx=avctx@entry=0x206cc40, avpkt=0x2075180) at libavcodec/:348
#5 0x000000000082138a in avcodec_send_frame (avctx=avctx@entry=0x206cc40, frame=frame@entry=0x2065ec0) at libavcodec/:444
#6 0x00000000004af758 in encode_frame (of=of@entry=0x20378c0, ost=ost@entry=0x203c040, frame=frame@entry=0x2065ec0) at fftools/:922
#7 0x00000000004b32f6 in do_audio_out (frame=0x2065ec0, ost=0x203c040, of=0x20378c0) at fftools/:1038
#8 reap_filters (flush=flush@entry=0) at fftools/:1436
#9 0x000000000049b54b in transcode_step () at fftools/:4467
#10 transcode () at fftools/:4511
#11 main (argc=<optimized out>, argv=<optimized out>) at fftools/:4706
Set timestamps for final encoded frames
The time base of the timestamp after the last step of encoding is still 1/output sample rate, and the final timestamp has a time base of 1/1000, which requires a bit of conversion:
/* Excerpt from fftools/ffmpeg_mux.c: rescales the encoded packet's timestamps
 * to the muxer stream's time base just before writing. */
void of_write_packet(OutputFile *of, AVPacket *pkt, OutputStream *ost,
int unqueue)
{
...
av_packet_rescale_ts(pkt, ost->mux_timebase, ost->st->time_base); // ost->mux_timebase is 1/48000, ost->st->time_base is 1/1000
...
}
Calling the stack
#0 of_write_packet (of=of@entry=0x215f000, pkt=0x2046a40, ost=0x2046600, unqueue=unqueue@entry=0) at fftools/ffmpeg_mux.c:140
#1 0x00000000004af18b in output_packet (of=of@entry=0x215f000, pkt=pkt@entry=0x2046a40, ost=ost@entry=0x2046600, eof=eof@entry=0) at fftools/ :740
#2 0x00000000004afe53 in encode_frame (of=of@entry=0x215f000, ost=ost@entry=0x2046600, frame=frame@entry=0x2071a80) at fftools/:1017
#3 0x00000000004b32f6 in do_audio_out (frame=0x2071a80, ost=0x2046600, of=0x215f000) at fftools/:1038
#4 reap_filters (flush=flush@entry=0) at fftools/:1436
#5 0x000000000049b54b in transcode_step () at fftools/:4467
#6 transcode () at fftools/:4511
#7 main (argc=<optimized out>, argv=<optimized out>) at fftools/:4706
Once the timestamp of the AVPacket is set, it can be encapsulated and sent out.
Video timestamp settings
Setting of pts timestamp after decoding video frames
After the video timestamps are decapsulated, f->ts_offset also has to be applied to them. That offset is derived from the smaller of the first audio and first video timestamps; since the audio frame's value is the smaller one, the first video frame's adjusted timestamp will be greater than 0.
Decoded into an AVFrame it is sent to the filter.
stack:
#0 sch_dec_send (sch=0x2f91700, dec_idx=1, frame=frame@entry=0x7fffc80008c0) at fftools/ffmpeg_sched.c:2169
#1 0x000000000049e0c9 in packet_decode (frame=0x7fffc80008c0, pkt=0x7fffc8000b40, dp=0x3faa740) at fftools/ffmpeg_dec.c:760
#2 decoder_thread (arg=0x3faa740) at fftools/ffmpeg_dec.c:889
#3 0x00000000004b85e9 in task_wrapper (arg=0x3faac68) at fftools/ffmpeg_sched.c:2447
#4 0x00007ffff60b4ea5 in start_thread () from /usr/lib64/libpthread.so.0
#5 0x00007ffff4dedb0d in clone () from /usr/lib64/libc.so.6
Setting the timestamp in the filter
In the filter, the timestamp's time base is adjusted from 1/1000 to 1/out_fps, i.e. the timestamp advances by 1 for every output frame. Another point to note is that this step applies special processing to video frames, controlled by the vsync parameter. It has several modes; to describe its role in detail: VSYNC_CFR means that when the ratio between the output frame rate and the input frame rate changes, the filter duplicates or drops frames to keep the output frame rate constant.
/*
 * Decide how many output frames to emit for this input frame (*nb_frames)
 * and how many extra times the PREVIOUS frame must be duplicated
 * (*nb_frames_prev), according to the selected vsync method.
 * NOTE(review): this is an excerpt — local declarations and the !frame
 * (EOF) branch that jumps to `finish:` are elided, and `ost`, `fps`,
 * `ofilter`, `frame_drop_threshold`, `dts_error_threshold` come from the
 * elided context.
 */
static void video_sync_process(OutputFilterPriv *ofp, AVFrame *frame,
int64_t *nb_frames, int64_t *nb_frames_prev)
{
// First compute how long the current frame should last in the output
// stream, measured in output-frame durations. E.g. with a 20 fps source
// and a 30 fps output, one source frame (50 ms) spans 1.5 output-frame
// durations, so frames must be duplicated to hold the output rate steady.
duration = frame->duration * av_q2d(frame->time_base) / av_q2d(ofp->tb_out);
// Convert the current frame's pts into the output stream's time base
// (the output time base is 1/out_fps).
sync_ipts = adjust_frame_pts_to_encoder_tb(frame, ofp->tb_out, ofp->ts_offset);
/* delta0 is the "drift" between the input frame and
* where it would fall in the output. */
// Compare the converted pts against the next expected output pts. If the
// converted pts has already passed the next output pts, a frame should be
// duplicated so the output catches up; conversely, if the converted pts is
// still well before the next output pts, frames can be dropped to let the
// input timestamps catch up.
delta0 = sync_ipts - ofp->next_pts;
delta = delta0 + duration;
// tracks the number of times the PREVIOUS frame should be duplicated,
// mostly for variable framerate (VFR)
*nb_frames_prev = 0;
/* by default, we output a single frame */
*nb_frames = 1;
if (delta0 < 0 &&
delta > 0 &&
ost->vsync_method != VSYNC_PASSTHROUGH
#if FFMPEG_OPT_VSYNC_DROP
&& ost->vsync_method != VSYNC_DROP
#endif
) {
if (delta0 < -0.6) {
av_log(ost, AV_LOG_VERBOSE, "Past duration %f too large\n", -delta0);
} else
av_log(ost, AV_LOG_DEBUG, "Clipping frame in rate conversion by %f\n", -delta0);
// Frame is late: snap it onto the output grid and shorten its duration.
sync_ipts = ofp->next_pts;
duration += delta0;
delta0 = 0;
}
switch (ost->vsync_method) {
case VSYNC_VSCFR:
// Variable-start CFR: do not duplicate frames before the first one.
if (fps->frame_number == 0 && delta0 >= 0.5) {
av_log(ost, AV_LOG_DEBUG, "Not duplicating %d initial frames\n", (int)lrintf(delta0));
delta = duration;
delta0 = 0;
ofp->next_pts = llrint(sync_ipts);
}
/* fallthrough */
case VSYNC_CFR:
// FIXME set to 0.5 after we fix some dts/pts bugs like in avidec.c
if (frame_drop_threshold && delta < frame_drop_threshold && fps->frame_number) {
*nb_frames = 0;
} else if (delta < -1.1)
*nb_frames = 0;
else if (delta > 1.1) {
*nb_frames = llrintf(delta); // round to nearest: delta >= 1.5 yields two output frames
if (delta0 > 1.1)
*nb_frames_prev = llrintf(delta0 - 0.6);
}
frame->duration = 1;
break;
case VSYNC_VFR:
if (delta <= -0.6)
*nb_frames = 0;
else if (delta > 0.6)
ofp->next_pts = llrint(sync_ipts);
frame->duration = llrint(duration);
break;
#if FFMPEG_OPT_VSYNC_DROP
case VSYNC_DROP:
#endif
case VSYNC_PASSTHROUGH:
ofp->next_pts = llrint(sync_ipts);
frame->duration = llrint(duration);
break;
default:
av_assert0(0);
}
finish:
// Shift the duplication history and record this frame's value; used by the
// elided !frame (EOF) branch to predict duplication for the last frame.
memmove(fps->frames_prev_hist + 1,
fps->frames_prev_hist,
sizeof(fps->frames_prev_hist[0]) * (FF_ARRAY_ELEMS(fps->frames_prev_hist) - 1));
fps->frames_prev_hist[0] = *nb_frames_prev;
if (*nb_frames_prev == 0 && fps->last_dropped) {
atomic_fetch_add(&ofilter->nb_frames_drop, 1);
av_log(ost, AV_LOG_VERBOSE,
"*** dropping frame %"PRId64" at ts %"PRId64"\n",
fps->frame_number, fps->last_frame->pts);
}
if (*nb_frames > (*nb_frames_prev && fps->last_dropped) + (*nb_frames > *nb_frames_prev)) {
uint64_t nb_frames_dup;
// Refuse absurd duplication counts (bad dts/pts input).
if (*nb_frames > dts_error_threshold * 30) {
av_log(ost, AV_LOG_ERROR, "%"PRId64" frame duplication too large, skipping\n", *nb_frames - 1);
atomic_fetch_add(&ofilter->nb_frames_drop, 1);
*nb_frames = 0;
return;
}
nb_frames_dup = atomic_fetch_add(&ofilter->nb_frames_dup,
*nb_frames - (*nb_frames_prev && fps->last_dropped) - (*nb_frames > *nb_frames_prev));
av_log(ost, AV_LOG_VERBOSE, "*** %"PRId64" dup!\n", *nb_frames - 1);
if (nb_frames_dup > fps->dup_warning) {
av_log(ost, AV_LOG_WARNING, "More than %"PRIu64" frames duplicated\n", fps->dup_warning);
fps->dup_warning *= 10;
}
}
fps->last_dropped = *nb_frames == *nb_frames_prev && frame;
fps->dropped_keyframe |= fps->last_dropped && (frame->flags & AV_FRAME_FLAG_KEY);
}
/*
 * Rescale frame->pts from the filter's time base (1/1000 here) to the
 * encoder's time base tb_dst (1/out_fps for video), subtracting start_time
 * (expressed in AV_TIME_BASE_Q). frame->pts and frame->time_base are
 * updated in place.
 *
 * Returns the rescaled pts as a double with sub-integer precision
 * (AV_NOPTS_VALUE cast to double when frame->pts is unset).
 */
static double adjust_frame_pts_to_encoder_tb(AVFrame *frame, AVRational tb_dst,
                                             int64_t start_time)
{
    double float_pts = AV_NOPTS_VALUE; // this is identical to frame.pts but with higher precision
    AVRational tb = tb_dst;
    AVRational filter_tb = frame->time_base;
    const int extra_bits = av_clip(29 - av_log2(tb.den), 0, 16);

    if (frame->pts == AV_NOPTS_VALUE)
        goto early_exit;

    // To improve timestamp precision, scale the destination time base up by
    // 2^extra_bits before rescaling; the integer result of av_rescale_q()
    // (int64_t) is then 2^extra_bits times larger.
    tb.den <<= extra_bits;
    // Rescale the frame's pts (time base 1/1000) to the enlarged output time base.
    float_pts = av_rescale_q(frame->pts, filter_tb, tb) -
                av_rescale_q(start_time, AV_TIME_BASE_Q, tb);
    // Scale back down by 2^extra_bits to obtain a fractional (double) pts.
    float_pts /= 1 << extra_bits;
    // when float_pts is not exactly an integer,
    // avoid exact midpoints to reduce the chance of rounding differences, this
    // can be removed in case the fps code is changed to work with integers
    if (float_pts != llrint(float_pts))
        float_pts += FFSIGN(float_pts) * 1.0 / (1<<17);

    frame->pts = av_rescale_q(frame->pts, filter_tb, tb_dst) -
                 av_rescale_q(start_time, AV_TIME_BASE_Q, tb_dst);
    frame->time_base = tb_dst;

early_exit:
    if (debug_ts) {
        av_log(NULL, AV_LOG_INFO, "filter -> pts:%s pts_time:%s exact:%f time_base:%d/%d\n",
               frame ? av_ts2str(frame->pts) : "NULL",
               av_ts2timestr(frame->pts, &tb_dst),
               float_pts, tb_dst.num, tb_dst.den);
    }
    return float_pts;
}
Call stack:
#0 video_sync_process (nb_frames_prev=<synthetic pointer>, nb_frames=<synthetic pointer>, frame=0x7fffdc0008c0, ofp=0x311cc00) at fftools/ ffmpeg_filter.c:2068
#1 fg_output_frame (ofp=ofp@entry=0x311cc00, fgt=fgt@entry=0x7fffe6198810, frame=frame@entry=0x7fffdc0008c0) at fftools/ffmpeg_filter.c:2230
#2 0x00000000004a84c3 in fg_output_step (frame=0x7fffdc0008c0, fgt=0x7fffe6198810, ofp=0x311cc00) at fftools/ffmpeg_filter.c:2371
#3 read_frames (fg=fg@entry=0x2fa6e80, fgt=fgt@entry=0x7fffe6198810, frame=0x7fffdc0008c0) at fftools/ffmpeg_filter.c:2432
#4 0x00000000004a873d in filter_thread (arg=0x2fa6e80) at fftools/ffmpeg_filter.c:2846
#5 0x00000000004b85e9 in task_wrapper (arg=0x3fb5370) at fftools/ffmpeg_sched.c:2447
#6 0x00007ffff60b4ea5 in start_thread () from /usr/lib64/libpthread.so.0
#7 0x00007ffff4dedb0d in clone () from /usr/lib64/libc.so.6
After changing the time base of the timestamp inside the filter, you can pass the pts to the encoder for encoding.
#0 sch_filter_send (sch=0x2f92700, fg_idx=1, out_idx=0, frame=frame@entry=0x7fffd00008c0) at fftools/ffmpeg_sched.c:2390
#1 0x00000000004a7c8a in fg_output_frame (ofp=ofp@entry=0x3f7e840, fgt=fgt@entry=0x7fffe595e810, frame=frame@entry=0x7fffd00008c0) at fftools/ ffmpeg_filter.c:2269
#2 0x00000000004a84c3 in fg_output_step (frame=0x7fffd00008c0, fgt=0x7fffe595e810, ofp=0x3f7e840) at fftools/ffmpeg_filter.c:2371
#3 read_frames (fg=fg@entry=0x3f7dd40, fgt=fgt@entry=0x7fffe595e810, frame=0x7fffd00008c0) at fftools/ffmpeg_filter.c:2432
#4 0x00000000004a873d in filter_thread (arg=0x3f7dd40) at fftools/ffmpeg_filter.c:2846
#5 0x00000000004b85e9 in task_wrapper (arg=0x3f7eae0) at fftools/ffmpeg_sched.c:2453
#6 0x00007ffff60b4ea5 in start_thread () from /usr/lib64/libpthread.so.0
#7 0x00007ffff4dedb0d in clone () from /usr/lib64/libc.so.6
Setting the timestamp in the encoder
Timestamps set in the encoder, timebase still uses 1/output_fps, dts increases by 1 for each outgoing frame, if there is a B-frame, the encoder will set the correct pts for you.
/*
 * libx264 encode callback (excerpt): after encoding, copy the timestamps
 * computed by libx264 into the output packet. The time base is still
 * 1/out_fps; dts increases by 1 per outgoing frame, and when B-frames are
 * enabled libx264 supplies the correctly reordered pts.
 */
static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
int *got_packet)
{
...
pkt->pts = pic_out.i_pts;
pkt->dts = pic_out.i_dts;
...
return 0;
}
call stack:
#0 X264_frame (ctx=0x2fbc140, pkt=0x7fffdc012780, frame=0x7fffdc014b00, got_packet=0x7fffe71619bc) at libavcodec/libx264.c:677
#1 0x00000000008737ae in ff_encode_encode_cb (avctx=0x2fbc140, avpkt=0x7fffdc012780, frame=0x7fffdc014b00, got_packet=0x7fffe71619bc) at libavcodec/encode.c:253
#2 0x0000000000873b0c in encode_simple_internal (avpkt=0x7fffdc012780, avctx=0x2fbc140) at libavcodec/encode.c:339
#3 encode_simple_receive_packet (avpkt=<optimized out>, avctx=<optimized out>) at libavcodec/encode.c:353
#4 encode_receive_packet_internal (avctx=avctx@entry=0x2fbc140, avpkt=0x7fffdc012780) at libavcodec/encode.c:387
#5 0x0000000000873d98 in avcodec_send_frame (avctx=avctx@entry=0x2fbc140, frame=frame@entry=0x7fffe00008c0) at libavcodec/encode.c:530
#6 0x00000000004a49af in encode_frame (of=0x2fbf4c0, pkt=0x7fffe0000b40, frame=0x7fffe00008c0, ost=0x2fa0e40) at fftools/ffmpeg_enc.c:675
#7 frame_encode (ost=ost@entry=0x2fa0e40, frame=0x7fffe00008c0, pkt=0x7fffe0000b40) at fftools/ffmpeg_enc.c:843
#8 0x00000000004a5412 in encoder_thread (arg=0x2fa0e40) at fftools/ffmpeg_enc.c:929
#9 0x00000000004b85e7 in task_wrapper (arg=0x3f83308) at fftools/ffmpeg_sched.c:2447
#10 0x00007ffff60b4ea5 in start_thread () from /usr/lib64/libpthread.so.0
#11 0x00007ffff4dedb0d in clone () from /usr/lib64/libc.so.6
Adjusting the time base to the output protocol's time base in the muxer thread
/*
 * Runs on the muxer thread: perform the final fixups on an encoded packet
 * (file-size limit check, timestamp rescale, stats) and submit it to the
 * muxer with av_interleaved_write_frame().
 *
 * Returns 0 on success or a negative AVERROR code; on failure the packet
 * is unreferenced before returning.
 */
static int write_packet(Muxer *mux, OutputStream *ost, AVPacket *pkt)
{
    MuxStream *ms = ms_from_ost(ost);
    AVFormatContext *s = mux->fc;
    int64_t fs;
    uint64_t frame_num;
    int ret;

    // Track output size and stop once the -fs limit is reached.
    fs = filesize(s->pb);
    atomic_store(&mux->last_filesize, fs);
    if (fs >= mux->limit_filesize) {
        ret = AVERROR_EOF;
        goto fail;
    }

    // The time base conversion happens inside the following call: here it
    // rescales from 1/out_fps to 1/1000 (the FLV muxer's time base).
    ret = mux_fixup_ts(mux, ms, pkt);
    if (ret < 0)
        goto fail;

    ms->data_size_mux += pkt->size;
    frame_num = atomic_fetch_add(&ost->packets_written, 1);
    pkt->stream_index = ost->index;

    // NOTE(review): the member name was lost in the original excerpt
    // ("if (ms->)"); restored to ms->stats.io per upstream ffmpeg_mux.c.
    if (ms->stats.io)
        enc_stats_write(ost, &ms->stats, NULL, pkt, frame_num);

    ret = av_interleaved_write_frame(s, pkt);
    if (ret < 0) {
        av_log(ost, AV_LOG_ERROR,
               "Error submitting a packet to the muxer: %s\n",
               av_err2str(ret));
        goto fail;
    }

    return 0;

fail:
    av_packet_unref(pkt);
    return ret;
}
#0 write_packet (mux=<optimized out>, ost=0x3f8f640, pkt=0x7fff480008c0) at fftools/ffmpeg_mux.c:228
#1 0x00000000004aadf3 in mux_packet_filter (mt=<optimized out>, stream_eof=<optimized out>, pkt=<optimized out>, ost=<optimized out>, mux=<optimized out>) at fftools/ffmpeg_mux.c:357
#2 muxer_thread (arg=0x30a4000) at fftools/ffmpeg_mux.c:438
#3 0x00000000004b858d in task_wrapper (arg=0x2fb7760) at fftools/ffmpeg_sched.c:2447
#4 0x00007ffff60b4ea5 in start_thread () from /usr/lib64/libpthread.so.0
#5 0x00007ffff4dedb0d in clone () from /usr/lib64/libc.so.6