67 lines
2.9 KiB
Diff
67 lines
2.9 KiB
Diff
Fix SIGSEGV when compiled with avx-512 instructions.
|
|
|
|
Due to unaligned allocations, library crashes in
|
|
nontemporalMemcpy in _mm512_stream_si512 (which requires
|
|
64-aligned allocations, but used to copy default-aligned objects).
|
|
|
|
Without this patch hipamd causes random crashes in hipMemcpy* callers
|
|
(tensile, rocBLAS, miopen, rocThrust, etc.).
|
|
|
|
Bug: https://bugs.gentoo.org/915969
|
|
Bug report in upstream: https://github.com/ROCm-Developer-Tools/clr/issues/18
|
|
--- a/rocclr/device/rocm/rocvirtual.cpp
|
|
+++ b/rocclr/device/rocm/rocvirtual.cpp
|
|
@@ -2790,44 +2790,6 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
|
return true;
|
|
}
|
|
|
|
-// ================================================================================================
|
|
-__attribute__((optimize("unroll-all-loops"), always_inline))
|
|
-static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
|
|
- uint16_t size) {
|
|
- #if defined(__AVX512F__)
|
|
- for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
|
|
- _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
|
|
- *reinterpret_cast<const __m512i* __restrict&>(src)++);
|
|
- }
|
|
- size = size % sizeof(__m512i);
|
|
- #endif
|
|
-
|
|
- #if defined(__AVX__)
|
|
- for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
|
|
- _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
|
|
- *reinterpret_cast<const __m256i* __restrict&>(src)++);
|
|
- }
|
|
- size = size % sizeof(__m256i);
|
|
- #endif
|
|
-
|
|
- for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
|
|
- _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
|
|
- *(reinterpret_cast<const __m128i* __restrict&>(src)++));
|
|
- }
|
|
- size = size % sizeof(__m128i);
|
|
-
|
|
- for (auto i = 0u; i != size / sizeof(long long); ++i) {
|
|
- _mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
|
|
- *reinterpret_cast<const long long* __restrict&>(src)++);
|
|
- }
|
|
- size = size % sizeof(long long);
|
|
-
|
|
- for (auto i = 0u; i != size / sizeof(int); ++i) {
|
|
- _mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
|
|
- *reinterpret_cast<const int* __restrict&>(src)++);
|
|
- }
|
|
-}
|
|
-
|
|
// ================================================================================================
|
|
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
|
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
|
|
@@ -3096,7 +3058,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
|
argBuffer = reinterpret_cast<address>(allocKernArg(gpuKernel.KernargSegmentByteSize(),
|
|
gpuKernel.KernargSegmentAlignment()));
|
|
// Load all kernel arguments
|
|
- nontemporalMemcpy(argBuffer, parameters,
|
|
+ memcpy(argBuffer, parameters,
|
|
std::min(gpuKernel.KernargSegmentByteSize(),
|
|
signature.paramsSize()));
|
|
}
|