llama-cpp-turboquant/tools/mtmd/mtmd-audio.h
Xuan-Son Nguyen 8ea958d4d9
model : add ASR support for LFM2-Audio-1.5B (conformer) (#18106)
* ASR with LFM2-Audio-1.5B

* Set rope_theta

* Fix comment

* Remove rope_theta setting

* Address PR feedback

* rename functions to conformer

* remove some redundant ggml_cont

* fix missing tensor

* add prefix "a." for conv tensors

* remove redundant reshape

* clean up

* add test model

---------

Co-authored-by: Tarek Dakhran <tarek@liquid.ai>
2025-12-19 00:18:01 +01:00

40 lines
1.2 KiB
C++

#pragma once
#include "ggml.h"
#include "clip-model.h"
#include <cstdint>
#include <vector>
#include <string>
#define MTMD_INTERNAL_HEADER
struct mtmd_audio_mel {
int n_len;
int n_len_org;
int n_mel;
std::vector<float> data;
};
struct mtmd_audio_preprocessor {
const clip_hparams & hparams;
mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
virtual ~mtmd_audio_preprocessor() = default;
virtual void initialize() = 0; // NOT thread-safe
virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
};
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
};
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
};