* ASR with LFM2-Audio-1.5B * Set rope_theta * Fix comment * Remove rope_theta setting * Address PR feedback * rename functions to conformer * remove some redundant ggml_cont * fix missing tensor * add prefix "a." for conv tensors * remove redundant reshape * clean up * add test model --------- Co-authored-by: Tarek Dakhran <tarek@liquid.ai>
40 lines
1.2 KiB
C++
40 lines
1.2 KiB
C++
#pragma once
|
|
|
|
#include "ggml.h"
|
|
#include "clip-model.h"
|
|
|
|
#include <cstdint>
|
|
#include <vector>
|
|
#include <string>
|
|
|
|
#define MTMD_INTERNAL_HEADER
|
|
|
|
// Container for the data an audio preprocessor writes into its `output` vector.
// Every counter has a zero default so a default-constructed value never holds
// indeterminate integers; the struct remains an aggregate (C++14 and later).
struct mtmd_audio_mel {
    int n_len     = 0; // NOTE(review): presumably the number of frames held in `data` - confirm
    int n_len_org = 0; // NOTE(review): presumably the frame count before any padding - confirm
    int n_mel     = 0; // NOTE(review): presumably the number of mel bins per frame - confirm

    std::vector<float> data; // flat float buffer; its layout is decided by whoever fills it
};
|
|
|
|
struct mtmd_audio_preprocessor {
|
|
const clip_hparams & hparams;
|
|
|
|
mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
|
|
|
virtual ~mtmd_audio_preprocessor() = default;
|
|
virtual void initialize() = 0; // NOT thread-safe
|
|
virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
|
|
};
|
|
|
|
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
|
|
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
|
void initialize() override;
|
|
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
|
};
|
|
|
|
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
|
|
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
|
void initialize() override;
|
|
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
|
};
|