This is a simple patch that replaces movnti with movq and clwb. It is suitable only for benchmarking (it doesn't handle alignment and page faults well). Signed-off-by: Mikulas Patocka --- arch/x86/lib/copy_user_64.S | 17 +++++++++-------- arch/x86/lib/usercopy_64.c | 25 +++++++++++++++++-------- 2 files changed, 26 insertions(+), 16 deletions(-) Index: linux-nova/arch/x86/lib/copy_user_64.S =================================================================== --- linux-nova.orig/arch/x86/lib/copy_user_64.S 2020-03-24 15:17:09.792758258 -0400 +++ linux-nova/arch/x86/lib/copy_user_64.S 2020-03-29 04:51:03.329260149 -0400 @@ -224,18 +224,19 @@ ENTRY(__copy_user_nocache) 2: movq 1*8(%rsi),%r9 3: movq 2*8(%rsi),%r10 4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) +5: movq %r8,(%rdi) +6: movq %r9,1*8(%rdi) +7: movq %r10,2*8(%rdi) +8: movq %r11,3*8(%rdi) 9: movq 4*8(%rsi),%r8 10: movq 5*8(%rsi),%r9 11: movq 6*8(%rsi),%r10 12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) +13: movq %r8,4*8(%rdi) +14: movq %r9,5*8(%rdi) +15: movq %r10,6*8(%rdi) +16: movq %r11,7*8(%rdi) + clwb (%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi decl %ecx Index: linux-nova/arch/x86/lib/usercopy_64.c =================================================================== --- linux-nova.orig/arch/x86/lib/usercopy_64.c 2020-03-24 15:17:09.794758318 -0400 +++ linux-nova/arch/x86/lib/usercopy_64.c 2020-03-29 04:51:16.841669119 -0400 @@ -172,20 +172,29 @@ void __memcpy_flushcache(void *_dst, con } /* 4x8 movnti loop */ - while (size >= 32) { + while (size >= 64) { asm("movq (%0), %%r8\n" "movq 8(%0), %%r9\n" "movq 16(%0), %%r10\n" "movq 24(%0), %%r11\n" - "movnti %%r8, (%1)\n" - "movnti %%r9, 8(%1)\n" - "movnti %%r10, 16(%1)\n" - "movnti %%r11, 24(%1)\n" + "movq %%r8, (%1)\n" + "movq %%r9, 8(%1)\n" + "movq %%r10, 16(%1)\n" + "movq %%r11, 24(%1)\n" + "movq 32(%0), %%r8\n" + "movq 40(%0), %%r9\n" + "movq 48(%0), %%r10\n" + "movq 56(%0), %%r11\n" + "movq %%r8, 32(%1)\n" + "movq %%r9, 40(%1)\n" + "movq %%r10, 48(%1)\n" + "movq %%r11, 56(%1)\n" + "clwb (%1)\n" :: "r" (source), "r" (dest) : "memory", "r8", "r9", "r10", "r11"); - dest += 32; - source += 32; - size -= 32; + dest += 64; + source += 64; + size -= 64; } /* 1x8 movnti loop */