chore : correct typos [no ci] (#20041)
* fix(docs): correct typos found during code review Non-functional changes only: - Fixed minor spelling mistakes in comments - Corrected typos in user-facing strings - No variables, logic, or functional code was modified. Signed-off-by: Marcel Petrick <mail@marcelpetrick.it> * Update docs/backend/CANN.md Co-authored-by: Aaron Teo <taronaeo@gmail.com> * Revert "Auxiliary commit to revert individual files from 846d1c301281178efbc6ce6060ad34c1ebe45af8" This reverts commit 02fcf0c7db661d5ff3eff96b2b2db9fdb7213256. * Update tests/test-backend-ops.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update tests/test-backend-ops.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Signed-off-by: Marcel Petrick <mail@marcelpetrick.it> Co-authored-by: Aaron Teo <taronaeo@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
7a99dc85e2
commit
92f7da00b4
81 changed files with 160 additions and 160 deletions
|
|
@ -259,7 +259,7 @@ extern "C" {
|
|||
Example usage:
|
||||
|
||||
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
|
||||
// preferrably to run on the same backend as the buffer
|
||||
// preferably to run on the same backend as the buffer
|
||||
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ extern "C" {
|
|||
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
|
||||
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
|
||||
|
||||
// set gradients to zero, initilize loss, and optionally reset the optimizer
|
||||
// set gradients to zero, initialize loss, and optionally reset the optimizer
|
||||
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
|
||||
|
||||
GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
|
||||
|
|
|
|||
|
|
@ -2575,7 +2575,7 @@ extern "C" {
|
|||
struct ggml_tensor * grad,
|
||||
struct ggml_tensor * sgd_params); // alpha, weight decay
|
||||
|
||||
// build forward mutiple tensors and select one of them for computing
|
||||
// build forward multiple tensors and select one of them for computing
|
||||
// this is useful for creating graphs that have constant topology but compute different things based on the input
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
|
||||
//
|
||||
|
|
|
|||
|
|
@ -195,7 +195,7 @@ struct tile_config_t{
|
|||
// will be needed.
|
||||
//
|
||||
// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
|
||||
// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
|
||||
// and the single batch gemm (m=1) has a special fast path with `avx512-vnni`.
|
||||
//
|
||||
// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
|
||||
// advanced-matrix-extensions-intrinsics-functions.html
|
||||
|
|
@ -1379,8 +1379,8 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
|
|||
// sum of offsets, shared across COLS
|
||||
//
|
||||
// avx512-vnni does not have `_mm512_dpbssd_epi32`,
|
||||
// need to transfrom ss to us:
|
||||
// a * (b - 8) is equavilent to b * a - 8 * a
|
||||
// need to transform ss to us:
|
||||
// a * (b - 8) is equivalent to b * a - 8 * a
|
||||
// s u u u s u s
|
||||
//
|
||||
__m512i vcomp;
|
||||
|
|
|
|||
|
|
@ -968,7 +968,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
||||
|
||||
//VLA Implemenation for SVE
|
||||
//VLA Implementation for SVE
|
||||
switch (vector_length) {
|
||||
case 128:
|
||||
{
|
||||
|
|
|
|||
|
|
@ -781,7 +781,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
|
|||
|
||||
const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;
|
||||
|
||||
// Load the 64 quants from q8K duplicated to use vecdots with the interelaved columns
|
||||
// Load the 64 quants from q8K duplicated to use vecdots with the interleaved columns
|
||||
// but still need the qs to use the low and hi bits from q4
|
||||
const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
|
||||
int8x16_t q8_qs[8];
|
||||
|
|
@ -3796,7 +3796,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n,
|
|||
|
||||
for (int b = 0; b < nb; b++) {
|
||||
// bsums pairs belongs to the same q8_k subblock
|
||||
// 64 elemnts loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
|
||||
// 64 elements loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
|
||||
const int16x8_t bsums[4]{
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
|
||||
|
|
|
|||
|
|
@ -423,7 +423,7 @@ void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|||
quants_interleaved[j] = i0;
|
||||
}
|
||||
|
||||
// Masks to shuffle the quants of corresonding sub blocks for rearraning quants for vectorized bsums computation
|
||||
// Masks to shuffle the quants of corresponding sub blocks for rearranging quants for vectorized bsums computation
|
||||
__m256i shuffle_mask_sb2 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 0, 1, 4, 5, 6, 7, 8, 9, 8, 9, 12, 13, 14, 15));
|
||||
shuffle_mask_sb2 = _mm256_permute2f128_si256(shuffle_mask_sb2, shuffle_mask_sb2, 0);
|
||||
__m256i shuffle_mask_sb3 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 8, 9, 14, 15));
|
||||
|
|
@ -625,7 +625,7 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
|||
iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170));
|
||||
iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255));
|
||||
|
||||
// Accumulated values multipled with appropriate scales
|
||||
// Accumulated values multiplied with appropriate scales
|
||||
acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
|
||||
}
|
||||
|
||||
|
|
@ -868,7 +868,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
|||
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
|
||||
const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
|
||||
|
||||
// Multiply with appropiate scales and accumulate
|
||||
// Multiply with appropriate scales and accumulate
|
||||
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
|
||||
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
|
||||
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
|
||||
|
|
@ -1076,7 +1076,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
|||
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
|
||||
const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
|
||||
|
||||
// Multiply with appropiate scales and accumulate
|
||||
// Multiply with appropriate scales and accumulate
|
||||
acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
|
||||
acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
|
||||
acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
|
||||
|
|
@ -1257,7 +1257,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
|||
// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
|
||||
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
|
||||
|
||||
// Multiply with appropiate scales and accumulate
|
||||
// Multiply with appropriate scales and accumulate
|
||||
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
|
||||
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
|
||||
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
|
||||
|
|
@ -1428,7 +1428,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
|||
// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
|
||||
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
|
||||
|
||||
// Multiply with appropiate scales and accumulate
|
||||
// Multiply with appropriate scales and accumulate
|
||||
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
|
||||
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
|
||||
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
|
||||
|
|
@ -1612,7 +1612,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
lhs_vec_11 = _mm256_permute2f128_si256(lhs_vec_11, lhs_vec_11, 0);
|
||||
|
||||
// Dot product done within 32 bit lanes and accumulated in the same vector
|
||||
// First done for first sub block and thenn for second sub block in each sb
|
||||
// First done for first sub block and then for second sub block in each sb
|
||||
// B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
|
||||
// B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
|
||||
// ...........................................................................
|
||||
|
|
@ -2422,7 +2422,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
|
||||
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
|
||||
|
||||
// Multiply with appropiate scales and accumulate (for both d and dmin) below
|
||||
// Multiply with appropriate scales and accumulate (for both d and dmin) below
|
||||
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
|
||||
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
|
||||
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
|
||||
|
|
@ -2785,7 +2785,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
|
||||
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
|
||||
|
||||
// Multiply with appropiate scales and accumulate (for both d and dmin) below
|
||||
// Multiply with appropriate scales and accumulate (for both d and dmin) below
|
||||
acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
|
||||
acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
|
||||
acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
|
||||
|
|
@ -2802,7 +2802,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
|
||||
}
|
||||
}
|
||||
// Store accumlated values
|
||||
// Store accumulated values
|
||||
for (int i = 0; i < 4; i++) {
|
||||
_mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
|
||||
}
|
||||
|
|
@ -3130,7 +3130,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
|
||||
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);//GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
|
||||
|
||||
// Multiply with appropiate scales and accumulate (for both d and dmin) below
|
||||
// Multiply with appropriate scales and accumulate (for both d and dmin) below
|
||||
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
|
||||
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
|
||||
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
|
||||
|
|
@ -3460,7 +3460,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
|
||||
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); //GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
|
||||
|
||||
// Multiply with appropiate scales and accumulate (for both d and dmin) below
|
||||
// Multiply with appropriate scales and accumulate (for both d and dmin) below
|
||||
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
|
||||
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
|
||||
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
|
||||
|
|
@ -4268,7 +4268,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
|
||||
const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
|
||||
|
||||
// Multiply with appropiate scales and accumulate (for both d and dmin) below
|
||||
// Multiply with appropriate scales and accumulate (for both d and dmin) below
|
||||
acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
|
||||
acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
|
||||
acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
|
||||
|
|
@ -5035,7 +5035,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
|
||||
}
|
||||
}
|
||||
// Store accumlated values
|
||||
// Store accumulated values
|
||||
for (int i = 0; i < 4; i++) {
|
||||
_mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
|
||||
}
|
||||
|
|
@ -5677,7 +5677,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
|
||||
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
|
||||
|
||||
// Multiply with appropiate scales and accumulate (for both d and dmin) below
|
||||
// Multiply with appropriate scales and accumulate (for both d and dmin) below
|
||||
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
|
||||
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
|
||||
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
|
||||
|
|
@ -6349,7 +6349,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
|
||||
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
|
||||
|
||||
// Multiply with appropiate scales and accumulate (for both d and dmin) below
|
||||
// Multiply with appropriate scales and accumulate (for both d and dmin) below
|
||||
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
|
||||
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
|
||||
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
|
||||
|
|
|
|||
|
|
@ -2477,7 +2477,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
|||
|
||||
if (prio != GGML_SCHED_PRIO_LOW) {
|
||||
// Tell Windows that this thread should not be throttled (needs its own CPU core).
|
||||
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
|
||||
// Newer Windows 11 versions aggressively park (offline) CPU cores and often place
|
||||
// all our threads onto the first 4 cores which results in terrible performance with
|
||||
// n_threads > 4
|
||||
#if _WIN32_WINNT >= 0x0602
|
||||
|
|
|
|||
|
|
@ -533,7 +533,7 @@ class tinyBLAS {
|
|||
if constexpr (RN > 1) {
|
||||
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
|
||||
} else {
|
||||
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
|
||||
GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N);
|
||||
GGML_ASSERT(false); // we have miss something.
|
||||
}
|
||||
}
|
||||
|
|
@ -711,7 +711,7 @@ class tinyBLAS_RVV {
|
|||
if constexpr (RN > 1) {
|
||||
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
|
||||
} else {
|
||||
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
|
||||
GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N);
|
||||
GGML_ASSERT(false); // we have miss something.
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -375,7 +375,7 @@ static void ggml_compute_forward_dup_bytes(
|
|||
const size_t rs = ne00 * type_size;
|
||||
|
||||
if (nb00 == type_size) {
|
||||
// src0 is contigous on first dimension, copy by rows
|
||||
// src0 is contiguous on first dimension, copy by rows
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
id += rs * ir0;
|
||||
|
|
@ -1795,7 +1795,7 @@ void ggml_compute_forward_repeat(
|
|||
{
|
||||
ggml_compute_forward_repeat_f32(params, dst);
|
||||
} break;
|
||||
// TODO: templateify the implemenation and support for I64
|
||||
// TODO: templateify the implementation and support for I64
|
||||
// ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
|
||||
//case GGML_TYPE_I64:
|
||||
// {
|
||||
|
|
|
|||
|
|
@ -3032,7 +3032,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
|
||||
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
|
||||
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
|
||||
|
||||
const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
|
||||
const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
|
||||
|
|
@ -3297,7 +3297,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|||
auto * wdata = (char *)params->wdata;
|
||||
auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
|
||||
|
||||
// total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
|
||||
// total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
|
||||
auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
|
||||
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
|
||||
|
||||
|
|
|
|||
|
|
@ -1215,7 +1215,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
|||
}
|
||||
|
||||
// If attention sinks are used, potentially re-scale if KQ_max is small.
|
||||
// Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum
|
||||
// Also add the sink as a value to KQ_rowsum, this is done after synchronization of KQ_rowsum
|
||||
// so it's being done unconditionally for every thread.
|
||||
if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) {
|
||||
float KQ_max_scale[cols_per_thread];
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
|
|||
return 128;
|
||||
}
|
||||
|
||||
// Currenlty llvm with the amdgcn target does not support unrolling loops
|
||||
// Currently llvm with the amdgcn target does not support unrolling loops
|
||||
// that contain a break that can not be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
#if defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
|
||||
#define GGML_USE_WMMA_FATTN
|
||||
#elif defined(RDNA4)
|
||||
#warning "rocwmma fattn is not suported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
|
||||
#warning "rocwmma fattn is not supported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
|
||||
#endif // defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
|
||||
|
|
|
|||
|
|
@ -3330,7 +3330,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
|||
return false;
|
||||
}
|
||||
|
||||
//rms_norm kernel assumes contigous rows
|
||||
//rms_norm kernel assumes contiguous rows
|
||||
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -235,7 +235,7 @@ static __global__ void quantize_mmq_q8_1(
|
|||
q.z = roundf(xi.z*d_inv);
|
||||
q.w = roundf(xi.w*d_inv);
|
||||
|
||||
// Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
|
||||
// Write back 4 int8 values as a single 32 bit value for better memory bandwidth:
|
||||
char4 * yqs4 = (char4 *) y[ib].qs;
|
||||
yqs4[iqs/4] = q;
|
||||
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ struct soft_max_params {
|
|||
};
|
||||
|
||||
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
|
||||
// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
|
||||
// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
|
|||
// ======================
|
||||
// When ncols_template == 0 the bounds for the loops in this function are not
|
||||
// known and can't be unrolled. As we want to keep pragma unroll for all other
|
||||
// cases we supress the clang transformation warning here.
|
||||
// cases we suppress the clang transformation warning here.
|
||||
#ifdef __clang__
|
||||
# pragma clang diagnostic push
|
||||
# pragma clang diagnostic ignored "-Wpass-failed"
|
||||
|
|
|
|||
|
|
@ -139,7 +139,7 @@ struct ggml_hexagon_session {
|
|||
};
|
||||
|
||||
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
|
||||
// Bump pending flag (cleared in the session::flush once we get the responce)
|
||||
// Bump pending flag (cleared in the session::flush once we get the response)
|
||||
this->op_pending++; // atomic inc
|
||||
|
||||
int err = dspqueue_write(this->queue,
|
||||
|
|
@ -443,7 +443,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|||
|
||||
// Repack the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Repack the scales
|
||||
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
|
||||
|
|
@ -503,7 +503,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|||
|
||||
// Repack the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Unpack the scales
|
||||
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
|
||||
|
|
@ -552,7 +552,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
|
|||
|
||||
// Init the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Unpack the scales
|
||||
x[i * 8 + 0].d = 0;
|
||||
|
|
@ -770,7 +770,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
|
|||
|
||||
// Repack the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Repack the scales
|
||||
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
|
||||
|
|
@ -829,7 +829,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
|
|||
|
||||
// Repack the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Unpack the scales
|
||||
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
|
||||
|
|
@ -878,7 +878,7 @@ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
|
|||
|
||||
// Init the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Unpack the scales
|
||||
x[i * 8 + 0].d = 0;
|
||||
|
|
@ -1120,7 +1120,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|||
|
||||
// Repack the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Repack the scales
|
||||
uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
|
||||
|
|
@ -1180,7 +1180,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|||
|
||||
// Repack the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Unpack the scales
|
||||
const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
|
||||
|
|
@ -1229,7 +1229,7 @@ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
|
|||
|
||||
// Init the scales
|
||||
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
|
||||
// the last block is truncated and overriden by the scales.
|
||||
// the last block is truncated and overridden by the scales.
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Unpack the scales
|
||||
x[i * 8 + 0].e = 0;
|
||||
|
|
@ -2670,7 +2670,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
|
|||
// The main goal here is to stack the MUL_MAT ops with the same src1 input.
|
||||
// This allows use to reuse dynamically quantized src1 in VTCM.
|
||||
|
||||
// TODO: the current version might do incorrect reodering in cases where quantized src0
|
||||
// TODO: the current version might do incorrect reordering in cases where quantized src0
|
||||
// input is an output of another Op.
|
||||
|
||||
for (int i0 = 0; i0 < n; i0++) {
|
||||
|
|
|
|||
|
|
@ -282,7 +282,7 @@ static std::string get_driver_path() {
|
|||
// Replace \SystemRoot with an absolute path from system ENV windir
|
||||
const std::wstring systemRootEnv = L"windir";
|
||||
|
||||
// Query the number of wide charactors this variable requires
|
||||
// Query the number of wide characters this variable requires
|
||||
DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0);
|
||||
if (numWords == 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n");
|
||||
|
|
|
|||
|
|
@ -67,7 +67,7 @@ static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) {
|
|||
|
||||
HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros
|
||||
|
||||
// Get mantissa for 16-bit represenation
|
||||
// Get mantissa for 16-bit representation
|
||||
HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));
|
||||
|
||||
//Compute Reciprocal Exponent
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
#include "htp-msg.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h
|
||||
// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h
|
||||
#define HTP_ROPE_TYPE_NORMAL 0
|
||||
#define HTP_ROPE_TYPE_NEOX 2
|
||||
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ static void worker_pool_main(void * context) {
|
|||
unsigned int n = atomic_load(&pool->n_jobs);
|
||||
unsigned int i = atomic_fetch_add(&pool->next_job, 1);
|
||||
if (i >= n) {
|
||||
// Spurios wakeup
|
||||
// Spurious wakeup
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1281,7 +1281,7 @@ struct ggml_metal_buffer {
|
|||
bool use_residency_sets;
|
||||
|
||||
// optional MTLResidencySet
|
||||
// note: cannot use explicity "id<MTLResidencySet>" here because it is not available on certain OSes
|
||||
// note: cannot use explicitly "id<MTLResidencySet>" here because it is not available on certain OSes
|
||||
id rset;
|
||||
|
||||
// pointers to global device
|
||||
|
|
|
|||
|
|
@ -631,7 +631,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
|
|||
const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
|
||||
|
||||
if (!inplace) {
|
||||
// run a separete kernel to cpy src->dst
|
||||
// run a separate kernel to cpy src->dst
|
||||
// not sure how to avoid this
|
||||
// TODO: make a simpler cpy_bytes kernel
|
||||
|
||||
|
|
@ -1644,7 +1644,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
|
|||
const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
|
||||
|
||||
if (!inplace) {
|
||||
// run a separete kernel to cpy src->dst
|
||||
// run a separate kernel to cpy src->dst
|
||||
// not sure how to avoid this
|
||||
// TODO: make a simpler cpy_bytes kernel
|
||||
|
||||
|
|
@ -2005,7 +2005,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
|
|||
const int16_t r0ptg = nypsg*nsg; // num src0 rows per threadgroup
|
||||
int16_t r1ptg = 4; // num src1 rows per threadgroup
|
||||
|
||||
// note: not sure how optimal are those across all different hardware. there might be someting cleverer
|
||||
// note: not sure how optimal are those across all different hardware. there might be something cleverer
|
||||
switch (ne11) {
|
||||
case 2:
|
||||
r1ptg = 2; break;
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@
|
|||
#define GGML_METAL_MAX_DEVICES 16
|
||||
|
||||
// number of Metal devices
|
||||
// note: can be overriden with GGML_METAL_DEVICES env to simulate virtual devices
|
||||
// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
|
||||
static int g_devices = 1;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
|||
|
|
@ -4218,7 +4218,7 @@ kernel void kernel_im2col(
|
|||
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
|
||||
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
|
||||
|
||||
// TODO: obolete -- remove
|
||||
// TODO: obsolete -- remove
|
||||
//typedef void (im2col_ext_t)(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
|
|
|
|||
|
|
@ -313,7 +313,7 @@ struct ProfilingInfo {
|
|||
cl_ulong cmd_duration_ns;
|
||||
// The time for the kernel to complete - COMPLETE - END
|
||||
cl_ulong cmd_complete_duration_ns;
|
||||
// Total time to finish the kernel - COMPELTE - QUEUED
|
||||
// Total time to finish the kernel - COMPLETE - QUEUED
|
||||
cl_ulong cmd_total_duration_ns;
|
||||
// Global and local work sizes.
|
||||
size_t global_size[3];
|
||||
|
|
@ -2555,7 +2555,7 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
|
|||
|
||||
cl_platform_id platform_ids[NPLAT];
|
||||
if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
|
||||
GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
|
||||
GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
|
||||
return found_devices;
|
||||
}
|
||||
|
||||
|
|
@ -3339,7 +3339,7 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
|
|||
CL_CHECK(clReleaseEvent(evt));
|
||||
}
|
||||
|
||||
// Syncronizes the 'backend_ctx's device with others so that commands
|
||||
// Synchronizes the 'backend_ctx's device with others so that commands
|
||||
// enqueued to it won't start until commands in the other devices have
|
||||
// completed.
|
||||
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
|
||||
|
|
@ -3997,7 +3997,7 @@ struct ggml_backend_opencl_buffer_context {
|
|||
|
||||
// The buffer_context is initially created by ggml_backend_buft_alloc_buffer
|
||||
// before any tensor is initialized (at the beginning of alloc_tensor_range).
|
||||
// Hence, there is alway a buffer object in this vector. When each tensor is
|
||||
// Hence, there is always a buffer object in this vector. When each tensor is
|
||||
// being initialized, this original buffer object will be released if both
|
||||
// flattening and small allocation are enabled, and additional buffer
|
||||
// objects will be created in init_tensor to represent flattened quantized
|
||||
|
|
@ -4132,7 +4132,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
//GGML_ASSERT(offset == 0);
|
||||
|
||||
// We create subbuffers from the original tensor buffer for scales and
|
||||
// quants - i.e., scales and quants are aliases into the buffer obejct
|
||||
// quants - i.e., scales and quants are aliases into the buffer object
|
||||
// that backs the original tensor. This is a cleaner way to adapt to the
|
||||
// new memory management.
|
||||
// In the old code, we allocate new buffers for scales and quants
|
||||
|
|
|
|||
|
|
@ -76,10 +76,10 @@ extern int g_ggml_sycl_prioritize_dmmv;
|
|||
|
||||
|
||||
#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
|
||||
#define VER_4VEC 610 // todo for hardward optimize.
|
||||
#define VER_GEN9 700 // todo for hardward optimize.
|
||||
#define VER_GEN12 1000000 // todo for hardward optimize.
|
||||
#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize.
|
||||
#define VER_4VEC 610 // todo for hardware optimize.
|
||||
#define VER_GEN9 700 // todo for hardware optimize.
|
||||
#define VER_GEN12 1000000 // todo for hardware optimize.
|
||||
#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardware optimize.
|
||||
|
||||
#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares
|
||||
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ namespace ggml_sycl_reordered {
|
|||
// [qs0, qs1, qs2, ..., qsN] [d0, d1, d2, ..., dN]
|
||||
//
|
||||
// Notes: out-of-bounds qs will run into d values
|
||||
// Aligment relies on the allocated size of qs
|
||||
// Alignment relies on the allocated size of qs
|
||||
|
||||
template <ggml_type type> struct block_q_t;
|
||||
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ struct soft_max_params {
|
|||
};
|
||||
|
||||
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
|
||||
// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
|
||||
// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ if (Vulkan_FOUND)
|
|||
target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
|
||||
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
|
||||
# Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
|
||||
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
|
||||
endif()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue