chore : correct typos [no ci] (#20041)

* fix(docs): correct typos found during code review

Non-functional changes only:
- Fixed minor spelling mistakes in comments
- Corrected typos in user-facing strings
- No variables, logic, or functional code was modified.

Signed-off-by: Marcel Petrick <mail@marcelpetrick.it>

* Update docs/backend/CANN.md

Co-authored-by: Aaron Teo <taronaeo@gmail.com>

* Revert "Auxiliary commit to revert individual files from 846d1c301281178efbc6ce6060ad34c1ebe45af8"

This reverts commit 02fcf0c7db661d5ff3eff96b2b2db9fdb7213256.

* Update tests/test-backend-ops.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-backend-ops.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Signed-off-by: Marcel Petrick <mail@marcelpetrick.it>
Co-authored-by: Aaron Teo <taronaeo@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Marcel Petrick 2026-03-05 08:50:21 +01:00 committed by GitHub
parent 7a99dc85e2
commit 92f7da00b4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
81 changed files with 160 additions and 160 deletions

View file

@ -259,7 +259,7 @@ extern "C" {
Example usage:
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// preferrably to run on the same backend as the buffer
// preferably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);

View file

@ -138,7 +138,7 @@ extern "C" {
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
// set gradients to zero, initilize loss, and optionally reset the optimizer
// set gradients to zero, initialize loss, and optionally reset the optimizer
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically

View file

@ -2575,7 +2575,7 @@ extern "C" {
struct ggml_tensor * grad,
struct ggml_tensor * sgd_params); // alpha, weight decay
// build forward mutiple tensors and select one of them for computing
// build forward multiple tensors and select one of them for computing
// this is useful for creating graphs that have constant topology but compute different things based on the input
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
//

View file

@ -195,7 +195,7 @@ struct tile_config_t{
// will be needed.
//
// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
// and the single batch gemm (m=1) has a special fast path with `avx512-vnni`.
//
// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
// advanced-matrix-extensions-intrinsics-functions.html
@ -1379,8 +1379,8 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
// sum of offsets, shared across COLS
//
// avx512-vnni does not have `_mm512_dpbssd_epi32`,
// need to transfrom ss to us:
// a * (b - 8) is equavilent to b * a - 8 * a
// need to transform ss to us:
// a * (b - 8) is equivalent to b * a - 8 * a
// s u u u s u s
//
__m512i vcomp;

View file

@ -968,7 +968,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int vector_length = ggml_cpu_get_sve_cnt()*8;
//VLA Implemenation for SVE
//VLA Implementation for SVE
switch (vector_length) {
case 128:
{

View file

@ -781,7 +781,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;
// Load the 64 quants from q8K duplicated to use vecdots with the interelaved columns
// Load the 64 quants from q8K duplicated to use vecdots with the interleaved columns
// but still need the qs to use the low and hi bits from q4
const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
int8x16_t q8_qs[8];
@ -3796,7 +3796,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n,
for (int b = 0; b < nb; b++) {
// bsums pairs belongs to the same q8_k subblock
// 64 elemnts loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
// 64 elements loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
const int16x8_t bsums[4]{
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),

View file

@ -423,7 +423,7 @@ void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
quants_interleaved[j] = i0;
}
// Masks to shuffle the quants of corresonding sub blocks for rearraning quants for vectorized bsums computation
// Masks to shuffle the quants of corresponding sub blocks for rearranging quants for vectorized bsums computation
__m256i shuffle_mask_sb2 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 0, 1, 4, 5, 6, 7, 8, 9, 8, 9, 12, 13, 14, 15));
shuffle_mask_sb2 = _mm256_permute2f128_si256(shuffle_mask_sb2, shuffle_mask_sb2, 0);
__m256i shuffle_mask_sb3 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 8, 9, 14, 15));
@ -625,7 +625,7 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170));
iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255));
// Accumulated values multipled with appropriate scales
// Accumulated values multiplied with appropriate scales
acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
}
@ -868,7 +868,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
// Multiply with appropiate scales and accumulate
// Multiply with appropriate scales and accumulate
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -1076,7 +1076,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
// Multiply with appropiate scales and accumulate
// Multiply with appropriate scales and accumulate
acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@ -1257,7 +1257,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
// Multiply with appropiate scales and accumulate
// Multiply with appropriate scales and accumulate
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -1428,7 +1428,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
// Multiply with appropiate scales and accumulate
// Multiply with appropriate scales and accumulate
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@ -1612,7 +1612,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
lhs_vec_11 = _mm256_permute2f128_si256(lhs_vec_11, lhs_vec_11, 0);
// Dot product done within 32 bit lanes and accumulated in the same vector
// First done for first sub block and thenn for second sub block in each sb
// First done for first sub block and then for second sub block in each sb
// B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
// B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
// ...........................................................................
@ -2422,7 +2422,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
// Multiply with appropiate scales and accumulate (for both d and dmin) below
// Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -2785,7 +2785,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
// Multiply with appropiate scales and accumulate (for both d and dmin) below
// Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@ -2802,7 +2802,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
}
}
// Store accumlated values
// Store accumulated values
for (int i = 0; i < 4; i++) {
_mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
}
@ -3130,7 +3130,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);//GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
// Multiply with appropiate scales and accumulate (for both d and dmin) below
// Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -3460,7 +3460,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); //GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
// Multiply with appropiate scales and accumulate (for both d and dmin) below
// Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@ -4268,7 +4268,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
// Multiply with appropiate scales and accumulate (for both d and dmin) below
// Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -5035,7 +5035,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
}
}
// Store accumlated values
// Store accumulated values
for (int i = 0; i < 4; i++) {
_mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
}
@ -5677,7 +5677,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
// Multiply with appropiate scales and accumulate (for both d and dmin) below
// Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -6349,7 +6349,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
// Multiply with appropiate scales and accumulate (for both d and dmin) below
// Multiply with appropriate scales and accumulate (for both d and dmin) below
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);

View file

@ -2477,7 +2477,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
if (prio != GGML_SCHED_PRIO_LOW) {
// Tell Windows that this thread should not be throttled (needs its own CPU core).
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
// Newer Windows 11 versions aggressively park (offline) CPU cores and often place
// all our threads onto the first 4 cores which results in terrible performance with
// n_threads > 4
#if _WIN32_WINNT >= 0x0602

View file

@ -533,7 +533,7 @@ class tinyBLAS {
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N);
GGML_ASSERT(false); // we have miss something.
}
}
@ -711,7 +711,7 @@ class tinyBLAS_RVV {
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N);
GGML_ASSERT(false); // we have miss something.
}
}

View file

@ -375,7 +375,7 @@ static void ggml_compute_forward_dup_bytes(
const size_t rs = ne00 * type_size;
if (nb00 == type_size) {
// src0 is contigous on first dimension, copy by rows
// src0 is contiguous on first dimension, copy by rows
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
@ -1795,7 +1795,7 @@ void ggml_compute_forward_repeat(
{
ggml_compute_forward_repeat_f32(params, dst);
} break;
// TODO: templateify the implemenation and support for I64
// TODO: templateify the implementation and support for I64
// ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
//case GGML_TYPE_I64:
// {

View file

@ -3032,7 +3032,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
case GGML_OP_MUL_MAT_ID:
{
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
@ -3297,7 +3297,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
auto * wdata = (char *)params->wdata;
auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
// total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
// total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]

View file

@ -1215,7 +1215,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
}
// If attention sinks are used, potentially re-scale if KQ_max is small.
// Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum
// Also add the sink as a value to KQ_rowsum, this is done after synchronization of KQ_rowsum
// so it's being done unconditionally for every thread.
if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) {
float KQ_max_scale[cols_per_thread];

View file

@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
return 128;
}
// Currenlty llvm with the amdgcn target does not support unrolling loops
// Currently llvm with the amdgcn target does not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push

View file

@ -18,7 +18,7 @@
#if defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
#define GGML_USE_WMMA_FATTN
#elif defined(RDNA4)
#warning "rocwmma fattn is not suported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
#warning "rocwmma fattn is not supported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
#endif // defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
#endif // defined(GGML_HIP_ROCWMMA_FATTN)

View file

@ -3330,7 +3330,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
return false;
}
//rms_norm kernel assumes contigous rows
//rms_norm kernel assumes contiguous rows
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
return false;
}

View file

@ -235,7 +235,7 @@ static __global__ void quantize_mmq_q8_1(
q.z = roundf(xi.z*d_inv);
q.w = roundf(xi.w*d_inv);
// Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
// Write back 4 int8 values as a single 32 bit value for better memory bandwidth:
char4 * yqs4 = (char4 *) y[ib].qs;
yqs4[iqs/4] = q;

View file

@ -46,7 +46,7 @@ struct soft_max_params {
};
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"

View file

@ -83,7 +83,7 @@ static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
// ======================
// When ncols_template == 0 the bounds for the loops in this function are not
// known and can't be unrolled. As we want to keep pragma unroll for all other
// cases we supress the clang transformation warning here.
// cases we suppress the clang transformation warning here.
#ifdef __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wpass-failed"

View file

@ -139,7 +139,7 @@ struct ggml_hexagon_session {
};
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
// Bump pending flag (cleared in the session::flush once we get the responce)
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
int err = dspqueue_write(this->queue,
@ -443,7 +443,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Repack the scales
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
@ -503,7 +503,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
@ -552,7 +552,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
// Init the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
x[i * 8 + 0].d = 0;
@ -770,7 +770,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Repack the scales
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
@ -829,7 +829,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
@ -878,7 +878,7 @@ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
// Init the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
x[i * 8 + 0].d = 0;
@ -1120,7 +1120,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Repack the scales
uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
@ -1180,7 +1180,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
// Repack the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
@ -1229,7 +1229,7 @@ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
// Init the scales
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
// the last block is truncated and overriden by the scales.
// the last block is truncated and overridden by the scales.
for (int i = 0; i < nb; i++) {
// Unpack the scales
x[i * 8 + 0].e = 0;
@ -2670,7 +2670,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
// The main goal here is to stack the MUL_MAT ops with the same src1 input.
// This allows use to reuse dynamically quantized src1 in VTCM.
// TODO: the current version might do incorrect reodering in cases where quantized src0
// TODO: the current version might do incorrect reordering in cases where quantized src0
// input is an output of another Op.
for (int i0 = 0; i0 < n; i0++) {

View file

@ -282,7 +282,7 @@ static std::string get_driver_path() {
// Replace \SystemRoot with an absolute path from system ENV windir
const std::wstring systemRootEnv = L"windir";
// Query the number of wide charactors this variable requires
// Query the number of wide characters this variable requires
DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0);
if (numWords == 0) {
GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n");

View file

@ -67,7 +67,7 @@ static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) {
HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros
// Get mantissa for 16-bit represenation
// Get mantissa for 16-bit representation
HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));
//Compute Reciprocal Exponent

View file

@ -18,7 +18,7 @@
#include "htp-msg.h"
#include "htp-ops.h"
// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h
// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h
#define HTP_ROPE_TYPE_NORMAL 0
#define HTP_ROPE_TYPE_NEOX 2

View file

@ -56,7 +56,7 @@ static void worker_pool_main(void * context) {
unsigned int n = atomic_load(&pool->n_jobs);
unsigned int i = atomic_fetch_add(&pool->next_job, 1);
if (i >= n) {
// Spurios wakeup
// Spurious wakeup
continue;
}

View file

@ -1281,7 +1281,7 @@ struct ggml_metal_buffer {
bool use_residency_sets;
// optional MTLResidencySet
// note: cannot use explicity "id<MTLResidencySet>" here because it is not available on certain OSes
// note: cannot use explicitly "id<MTLResidencySet>" here because it is not available on certain OSes
id rset;
// pointers to global device

View file

@ -631,7 +631,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
if (!inplace) {
// run a separete kernel to cpy src->dst
// run a separate kernel to cpy src->dst
// not sure how to avoid this
// TODO: make a simpler cpy_bytes kernel
@ -1644,7 +1644,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
if (!inplace) {
// run a separete kernel to cpy src->dst
// run a separate kernel to cpy src->dst
// not sure how to avoid this
// TODO: make a simpler cpy_bytes kernel
@ -2005,7 +2005,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
const int16_t r0ptg = nypsg*nsg; // num src0 rows per threadgroup
int16_t r1ptg = 4; // num src1 rows per threadgroup
// note: not sure how optimal are those across all different hardware. there might be someting cleverer
// note: not sure how optimal are those across all different hardware. there might be something cleverer
switch (ne11) {
case 2:
r1ptg = 2; break;

View file

@ -14,7 +14,7 @@
#define GGML_METAL_MAX_DEVICES 16
// number of Metal devices
// note: can be overriden with GGML_METAL_DEVICES env to simulate virtual devices
// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
static int g_devices = 1;
////////////////////////////////////////////////////////////////////////////////

View file

@ -4218,7 +4218,7 @@ kernel void kernel_im2col(
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
// TODO: obolete -- remove
// TODO: obsolete -- remove
//typedef void (im2col_ext_t)(
// constant ggml_metal_kargs_im2col & args,
// device const float * x,

View file

@ -313,7 +313,7 @@ struct ProfilingInfo {
cl_ulong cmd_duration_ns;
// The time for the kernel to complete - COMPLETE - END
cl_ulong cmd_complete_duration_ns;
// Total time to finish the kernel - COMPELTE - QUEUED
// Total time to finish the kernel - COMPLETE - QUEUED
cl_ulong cmd_total_duration_ns;
// Global and local work sizes.
size_t global_size[3];
@ -2555,7 +2555,7 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
cl_platform_id platform_ids[NPLAT];
if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
return found_devices;
}
@ -3339,7 +3339,7 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
CL_CHECK(clReleaseEvent(evt));
}
// Syncronizes the 'backend_ctx's device with others so that commands
// Synchronizes the 'backend_ctx's device with others so that commands
// enqueued to it won't start until commands in the other devices have
// completed.
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
@ -3997,7 +3997,7 @@ struct ggml_backend_opencl_buffer_context {
// The buffer_context is initially created by ggml_backend_buft_alloc_buffer
// before any tensor is initialized (at the beginning of alloc_tensor_range).
// Hence, there is alway a buffer object in this vector. When each tensor is
// Hence, there is always a buffer object in this vector. When each tensor is
// being initialized, this original buffer object will be released if both
// flattening and small allocation are enabled, and additional buffer
// objects will be created in init_tensor to represent flattened quantized
@ -4132,7 +4132,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
//GGML_ASSERT(offset == 0);
// We create subbuffers from the original tensor buffer for scales and
// quants - i.e., scales and quants are aliases into the buffer obejct
// quants - i.e., scales and quants are aliases into the buffer object
// that backs the original tensor. This is a cleaner way to adapt to the
// new memory management.
// In the old code, we allocate new buffers for scales and quants

View file

@ -76,10 +76,10 @@ extern int g_ggml_sycl_prioritize_dmmv;
#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
#define VER_4VEC 610 // todo for hardward optimize.
#define VER_GEN9 700 // todo for hardward optimize.
#define VER_GEN12 1000000 // todo for hardward optimize.
#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize.
#define VER_4VEC 610 // todo for hardware optimize.
#define VER_GEN9 700 // todo for hardware optimize.
#define VER_GEN12 1000000 // todo for hardware optimize.
#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardware optimize.
#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares

View file

@ -29,7 +29,7 @@ namespace ggml_sycl_reordered {
// [qs0, qs1, qs2, ..., qsN] [d0, d1, d2, ..., dN]
//
// Notes: out-of-bounds qs will run into d values
// Aligment relies on the allocated size of qs
// Alignment relies on the allocated size of qs
template <ggml_type type> struct block_q_t;

View file

@ -37,7 +37,7 @@ struct soft_max_params {
};
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"

View file

@ -90,7 +90,7 @@ if (Vulkan_FOUND)
target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
# Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
endif()