context : use n_embd_out for pooled embedding extraction (#20840)
The MEAN/CLS/LAST pooling paths in encode() and decode() used n_embd_inp() (16384 for qwen3vl with deepstack) to index into the pooled embedding tensor, which only holds n_embd_out() (4096) floats per sequence. This triggered an out-of-bounds tensor-read assertion. Fixes embedding mode for Qwen3-VL-Embedding models.
This commit is contained in:
parent
568aec82d2
commit
212f4521b0
1 changed file with 11 additions and 4 deletions
|
|
@ -1347,8 +1347,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||||
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
|
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
|
||||||
const int32_t seq_idx = ubatch.seq_idx[seq_id];
|
const int32_t seq_idx = ubatch.seq_idx[seq_id];
|
||||||
|
|
||||||
embd_seq_out[seq_id].resize(n_embd);
|
// use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
|
||||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
|
// output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
|
||||||
|
const uint32_t n_embd_out = hparams.n_embd_out();
|
||||||
|
embd_seq_out[seq_id].resize(n_embd_out);
|
||||||
|
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLAMA_POOLING_TYPE_RANK:
|
case LLAMA_POOLING_TYPE_RANK:
|
||||||
|
|
@ -1769,12 +1772,16 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||||
// extract sequence embeddings (cleared before processing each batch)
|
// extract sequence embeddings (cleared before processing each batch)
|
||||||
auto & embd_seq_out = embd_seq;
|
auto & embd_seq_out = embd_seq;
|
||||||
|
|
||||||
|
// use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
|
||||||
|
// output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
|
||||||
|
const uint32_t n_embd_out = hparams.n_embd_out();
|
||||||
|
|
||||||
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
|
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
|
||||||
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
|
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
|
||||||
const int32_t seq_idx = ubatch.seq_idx[seq_id];
|
const int32_t seq_idx = ubatch.seq_idx[seq_id];
|
||||||
|
|
||||||
embd_seq_out[seq_id].resize(n_embd);
|
embd_seq_out[seq_id].resize(n_embd_out);
|
||||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
|
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLAMA_POOLING_TYPE_RANK:
|
case LLAMA_POOLING_TYPE_RANK:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue