Sized Memory Inputs in gcc Inline Assembly

I learned about a gcc feature when working on some x64 Linux syscall wrappers (again).

It is known that the "memory" clobber prevents gcc from reordering memory accesses across the asm block and invalidates cached memory values in registers. (This only applies to potentially globally accessible memory: local variables that don't escape are not flushed.) This is safe, but may be too coarse-grained. Take the following contrived example:

// Baseline: two back-to-back pairs of stores with no asm statement in between,
// so the first pair is dead and the compiler is free to drop it.
void f0(int *restrict p, int *restrict q) {
    *p = 2, *q = 4;
    *p = 3, *q = 5;
}
// Same stores as the baseline, but with an asm statement between the pairs.
// The asm receives only the pointer value p in a register: it declares no
// memory operand and no clobber, so nothing tells the compiler that the asm
// reads *p or *q.
void f1(int *restrict p, int *restrict q) {
    *p = 2, *q = 4;
    asm volatile ( "nop" :: "r"(p) : );
    *p = 3, *q = 5;
}
// Same as the register-only variant, plus a "memory" clobber on the asm.
// The clobber applies to all memory, so it covers *q as well as *p.
void f2(int *restrict p, int *restrict q) {
    *p = 2, *q = 4;
    asm volatile ( "nop" :: "r"(p) : "memory" );
    *p = 3, *q = 5;
}
// Fine-grained variant: instead of a "memory" clobber, the asm lists *p as an
// "m" (memory) input operand. Only *p is declared as read by the asm; *q is
// not mentioned at all.
void f3(int *restrict p, int *restrict q) {
    *p = 2, *q = 4;
    asm volatile ( "nop" :: "r"(p), "m"(*p) : );
    *p = 3, *q = 5;
}
f0:
        mov     DWORD PTR [rdi], 3
        mov     DWORD PTR [rsi], 5
        ret
f1:
        nop
        mov     DWORD PTR [rdi], 3
        mov     DWORD PTR [rsi], 5
        ret
f2:
        mov     DWORD PTR [rdi], 2
        mov     DWORD PTR [rsi], 4
        nop
        mov     DWORD PTR [rdi], 3
        mov     DWORD PTR [rsi], 5
        ret
f3:
        mov     DWORD PTR [rdi], 2
        nop
        mov     DWORD PTR [rdi], 3
        mov     DWORD PTR [rsi], 5
        ret

Here, nop is a placeholder for code that reads from *p.

Now consider the write syscall, which reads from a buffer passed as a pointer-length pair. Ideally, there should be a read-only memory barrier on the buffer and only the buffer. This is where the gcc feature comes in: you can pass an array as a memory input operand, like "m" (*(const char (*)[10]) p). It would take some incredible work to make it work with alias analysis, I thought.

Except it doesn't quite work the way I expected:

// Baseline: two consecutive stores to buf[len] with nothing in between.
// buf[len] is the byte just past a len-byte region starting at buf.
void f0(char *buf, unsigned long len) {
    buf[len] = 3;
    buf[len] = 4;
}
// Same stores, separated by an asm whose only operand is a memory input typed
// as an array of len chars at buf (bytes buf[0] .. buf[len-1]). The stored-to
// byte buf[len] lies outside that declared range.
void f1(char *buf, unsigned long len) {
    buf[len] = 3;
    asm volatile ( "nop" :: "m"(*(const char (*)[len]) buf) : );
    buf[len] = 4;
}
f0:
        mov     BYTE PTR [rdi+rsi], 4
        ret
f1:
        mov     BYTE PTR [rdi+rsi], 3
        nop
        mov     BYTE PTR [rdi+rsi], 4
        ret

Oops. Uh, maybe variable lengths are too hard for gcc. What if the length is constant?

// GEN_F(ot, oi, it, ic) defines a test function named f_<ot><oi>_<it><ic>.
//   ot - element type of the pointer parameter p
//   oi - index into p that is stored to (p[oi] = 3, then p[oi] = 4)
//   it - element type of the array used as the asm memory input
//   ic - element count of that array (constant length)
// Between the two stores sits an empty asm whose sole operand is an "m" input
// covering the first ic elements of type it starting at p.
#define GEN_F(ot,oi,it,ic) \
    void f_ ## ot ## oi ## _ ## it ## ic(ot *p) {       \
        p[oi] = 3;                                      \
        asm volatile ( "" : : "m"(*(it (*)[ic])p) : );  \
        p[oi] = 4;                                      \
    }

                            // number of stores to p[oi] in the compiled output
                            // (1 = the first store was eliminated)

// Store to p[2]: an int array of 3 or more elements contains p[2], so two
// stores are required for those cases.
GEN_F(int, 2, int, 1);      // 1
GEN_F(int, 2, int, 2);      // 1
GEN_F(int, 2, int, 3);      // 2
GEN_F(int, 2, int, 4);      // 2

// Store to p[40]: outside every int array listed here (at most 8 elements).
GEN_F(int, 40, int, 1);     // 1
GEN_F(int, 40, int, 2);     // 1
GEN_F(int, 40, int, 3);     // 2
GEN_F(int, 40, int, 4);     // 1
GEN_F(int, 40, int, 5);     // 2
GEN_F(int, 40, int, 6);     // 2
GEN_F(int, 40, int, 7);     // 2
GEN_F(int, 40, int, 8);     // 2

// Store to p[40] again, with the operand sized in bytes (char elements).
GEN_F(int, 40, char, 1);    // 1
GEN_F(int, 40, char, 2);    // 1
GEN_F(int, 40, char, 3);    // 2
GEN_F(int, 40, char, 4);    // 1
GEN_F(int, 40, char, 5);    // 2
GEN_F(int, 40, char, 6);    // 2
GEN_F(int, 40, char, 7);    // 2
GEN_F(int, 40, char, 8);    // 1
GEN_F(int, 40, char, 15);   // 2
GEN_F(int, 40, char, 16);   // 1
GEN_F(int, 40, char, 17);   // 2
GEN_F(int, 40, char, 24);   // 2
GEN_F(int, 40, char, 32);   // 2
GEN_F(int, 40, char, 64);   // 2

From this it appears that the length hint only works when the array has size 1, 2, 4, 8, or 16 bytes. My guess is that this feature was designed for use with vector instructions.

A dead end, but good to know.


I don't know why, but gcc 15.2 somehow generates different code for the ++*q updates in f0 and f1 below: f0 uses a single add to memory, while f1 uses a separate load, add, and store.

// Baseline: *p is stored twice and *q is incremented twice, with no asm
// statement in between.
void f0(int *restrict p, int *restrict q) {
    *p = 2, ++*q;
    *p = 3, ++*q;
}
// Same operations as the baseline, separated by an asm that takes only the
// pointer value p as a register input (no memory operand, no clobber).
void f1(int *restrict p, int *restrict q) {
    *p = 2, ++*q;
    asm volatile ( "nop" :: "r"(p) : );
    *p = 3, ++*q;
}
f0:
        add     DWORD PTR [rsi], 2
        mov     DWORD PTR [rdi], 3
        ret
f1:
        nop
        mov     eax, DWORD PTR [rsi]
        mov     DWORD PTR [rdi], 3
        add     eax, 2
        mov     DWORD PTR [rsi], eax
        ret

Also, one may wonder what clang does about this. I tested clang 22.1.0 and it seems to ignore the array length hint: all GEN_F functions make two stores.