update 3.md and 4.md

2025-10-22 03:04:38 +09:00
parent a86855a11c
commit 6f41ae9730
3 changed files with 349 additions and 17 deletions
--- a/notes/3.md
+++ b/notes/3.md
@@ -52,7 +52,7 @@ void sumstore(long x, long y, long *dest) {
 ```

 ```sh {cmd hide}
-while ![ -r 3_1.o ]; do sleep .1; done; objdump -d 3_1.o
+while ! [ -r 3_1.o ]; do sleep .1; done; objdump -d 3_1.o
 ```

 ### Integer Registers
@@ -267,7 +267,7 @@ long absdiff(long x, long y) {
 ```

 ```sh { cmd hide }
-while ![ -r 3_3.o ]; do sleep .1; done; objdump -d 3_3.o -Msuffix
+while ! [ -r 3_3.o ]; do sleep .1; done; objdump -d 3_3.o -Msuffix
 ```

 **expressing with `goto`**
@@ -300,7 +300,7 @@ long absdiff(long x, long y) {
 ```

 ```sh {cmd hide}
-while ![ -r 3_5.o ]; do sleep .1; done; objdump -d 3_5.o -Msuffix
+while ! [ -r 3_5.o ]; do sleep .1; done; objdump -d 3_5.o -Msuffix
 ```

 However, there are several *bad cases* for conditional move.
@@ -357,7 +357,7 @@ loop:
 </table>

 ```sh {cmd hide}
-while ![ -r 3_6.o ]; do sleep .1; done; objdump -d 3_6.o -Msuffix
+while ! [ -r 3_6.o ]; do sleep .1; done; objdump -d 3_6.o -Msuffix
 ```

 **general do-while translation**
@@ -426,7 +426,7 @@ long pcount_while(unsigned long x) {
 ```
 ```sh {cmd hide}
 echo "jmp-to-middle translation"
-while ![ -r 3_7.o ]; do sleep .1; done; objdump -d 3_7.o -Msuffix
+while ! [ -r 3_7.o ]; do sleep .1; done; objdump -d 3_7.o -Msuffix
 ```

 **general while translation#2**
@@ -478,7 +478,7 @@ long pcount_while(unsigned long x) {
 ```
 ```sh {cmd hide}
 echo "while to do-while conversion"
-while ![ -r 3_8.o ]; do sleep .1; done; objdump -d 3_8.o -Msuffix
+while ! [ -r 3_8.o ]; do sleep .1; done; objdump -d 3_8.o -Msuffix
 ```

 #### for loop form
@@ -560,13 +560,13 @@ long pcount_for(unsigned long x) {
 <td>

 ```sh {cmd hide}
-while ![ -r 3_9.o ]; do sleep .1; done; objdump -d 3_9.o -Msuffix
+while ! [ -r 3_9.o ]; do sleep .1; done; objdump -d 3_9.o -Msuffix
 ```
 </td>
 <td>

 ```sh {cmd hide}
-while ![ -r 3_10.o ]; do sleep .1; done; objdump -d 3_10.o -Msuffix
+while ! [ -r 3_10.o ]; do sleep .1; done; objdump -d 3_10.o -Msuffix
 ```
 </td>
 </tr>
@@ -614,7 +614,7 @@ long switch_eg (long x, long y, long z) {
 <td>

 ```sh {cmd hide}
-while ![ -r 3_11.s ]; do sleep .1; done; cat 3_11.s
+while ! [ -r 3_11.s ]; do sleep .1; done; cat 3_11.s
 ```
 </td>
 </tr>
@@ -667,7 +667,7 @@ void multstore(long x, long y, long *dest) {
 ```

 ```sh {cmd hide}
-while ![ -r 3_12.o ]; do sleep .1; done; objdump -d 3_12.o -Msuffix
+while ! [ -r 3_12.o ]; do sleep .1; done; objdump -d 3_12.o -Msuffix
 ```

 Procedure call `call label`
@@ -687,7 +687,7 @@ Procedure return: `ret`
 for example with above example

 ```sh {cmd hide}
-while ![ -r 3_12.o ]; do sleep .1; done; objdump -d 3_12.o -Msuffix
+while ! [ -r 3_12.o ]; do sleep .1; done; objdump -d 3_12.o -Msuffix
 ```

 * with above `mult2` variable `t` is already stored in `%rax`
@@ -718,6 +718,38 @@ Deallocated when return, "finish" code and includes pop by `ret`.

 #### x86-64/Linux Stack Frame

+![stack frame image](/assets/3_1stackframe.png)
+
+* Arguments
+* Local variables
+* Old `rbp`
+
+### Register Saving Conventions
+
+When a function is called, the temporary values held in registers may be overwritten by the called function, which can cause trouble. So there are **conventions** for saving register values.
+
+When procedure `yoo` calls `who`: `yoo` is `caller`, `who` is `callee`
+* Caller saves temporary values in its frame before the call.
+* Callee saves temporary values in its frame before using them and restores them before returning to the caller.
+
+
+#### x86-64 Linux Register Usage
+
+`%rbx`, `%r12`, `%r13`, `%r14`, `%r15`
+* Callee-saved
+* Callee must save & restore
+
+`%rbp`
+* Callee-saved
+* Callee must save & restore
+* May be used as frame pointer by callee
+* Can mix & match
+
+`%rsp`
+* Special form of callee-saved
+* Restored to original value upon exit from procedure
+
+#### EX 

 * for compile w/o *stack canary*, add option `-fno-stack-protector`
 ```c {cmd=gcc args=[-Og -x c -fno-stack-protector -c $input_file -o 3_13.o]}
@@ -735,5 +767,27 @@ long call_incr() {
 ```

 ```sh {cmd hide}
-while ![ -r 3_13.o ]; do sleep .1; done; objdump -d 3_13.o -Msuffix
-```
+while ! [ -r 3_13.o ]; do sleep .1; done; objdump -d 3_13.o -Msuffix
+```
+
+### Recursive Function
+
+```c {cmd=gcc args=[-O1 -x c -fno-stack-protector -c $input_file -o 3_14.o]}
+long pcount_r(unsigned long x) {
+  if (x == 0) {
+    return 0;
+  } else {
+    return (x & 1) + pcount_r(x >> 1);
+  }
+}
+```
+
+```sh {cmd hide}
+while ! [ -r 3_14.o ]; do sleep .1; done; objdump -d 3_14.o -Msuffix
+```
+
+Recursion is not a special case: it is handled like any other function call.
+* Stack frames mean that each function call has private storage.
+* Register saving conventions prevent one function call from corrupting another's data, *unless the code explicitly does so, e.g. via a buffer overflow*.
+* Stack discipline follows the call/return pattern (LIFO)
+
--- a/notes/4.md
+++ b/notes/4.md
@@ -1,6 +1,284 @@
-# Machine Level Programming
+# Optimization

-아키텍쳐(ISA)
-* intel(x86): CISC
-* ARM(aarch64, aarch32): RISC
+There's more to performance than asymptotic complexity(time complexity).
+
+But not all instructions take the same amount of time. Constant factors matter too! So we need to understand the system to optimize performance.
+* How programs are compiled and executed
+* How modern processors and memory system operate
+* How to measure performance and identify bottlenecks
+* How to improve performance without destroying code modularity and generality
+
+Optimizing compilers provide an efficient mapping of the program to machine code:
+* Register allocation
+* Code selection and ordering (scheduling)
+* Dead code elimination
+* Eliminating minor inefficiencies
+
+**Compilers usually don't improve asymptotic efficiency**; choosing the best overall algorithm is up to the programmer.
+
+## Generally Useful Optimizations
+
+### Code Motion(Hoisting)
+
+Reduce the frequency with which a computation is performed. If it will always produce the same result, move it to a place where it is computed once and reused.
+This especially applies to moving code out of loops.
+
+```c {cmd=gcc args=[-Og -x c -c $input_file -o 4_1.o]}
+void set_row(double *a, double *b, long i, long n) {
+    long j;
+    for (j = 0; j < n; j++) {
+        a[i * n + j] = b[j];
+    }
+}
+```
+
+<table>
+<tr><th>Default</th><th>Optimized</th></tr>
+<tr><td>
+
+```c {cmd=gcc args=[-O1 -x c -c $input_file -o 4_2.o]}
+void set_row(double *a, double *b, long i, long n) {
+    long j;
+    for (j = 0; j < n; j++) {
+        a[i * n + j] = b[j];
+    }
+}
+```
+</td><td>
+
+```c
+void set_row_opt(double *a, double *b, long i, long n) {
+    long j;
+    int ni = n * i;
+    for (j = 0; j < n; j++) {
+        a[ni + j] = b[j];
+    }
+}
+```
+</td></tr>
+<tr>
+<td>
+
+```sh {cmd hide}
+while ! [ -r 4_1.o ]; do sleep .1; done; objdump -d 4_1.o
+```
+`imul` is located in the loop.
+</td>
+<td>
+
+```sh {cmd hide}
+while ! [ -r 4_2.o ]; do sleep .1; done; objdump -d 4_2.o
+```
+We can see that `imul` is located outside the loop.
+</td>
+</tr>
+</table>
+
+The two versions above have the same number of instructions, but the optimized version has **fewer executed instructions**.
+
+GCC will do this with the `-O1` option.
+
+### Reduction in Strength
+
+Replace costly operation with simpler one.
+
+For example, a multiplication by a power of 2 can be replaced with a shift. In general, multiply and divide are expensive: on Intel Nehalem, `imul` requires 3 CPU cycles, whereas `add` requires 1 cycle.
+
+<table>
+<tr><th>Default</th><th>Optimized</th></tr>
+<tr><td>
+
+```c
+void test_reduction(double *a, double *b, long i, long n) {
+    int i, j;
+    for (i = 0;i < n; i++) {
+        int ni = n * i;
+        for (j = 0; j < n; j++) {
+            a[ni + j] = b[j];
+        }
+    }
+}
+```
+</td><td>
+
+```c
+void test_reduction_opt(double *a, double *b, long i, long n) {
+    int i, j;
+    int ni = 0;
+    for (i = 0;i < n; i++) {
+        for (j = 0; j < n; j++) {
+            a[ni + j] = b[j];
+        }
+        ni += n;
+    }
+}
+```
+</td></tr>
+</table>
+
+### Share Common Subexpressions
+
+Reuse portions of expressions.
+
+GCC will do this with `-O1`
+
+<table>
+<tr><th>Default</th><th>Optimized</th></tr>
+<tr><td>
+
+```c {cmd=gcc args=[-O1 -x c -c $input_file -o 4_3.o]}
+double test_scs(double* val, long i, long j, long n) {
+    double up, down, left, right;
+
+    up = val[(i - 1) * n + j];
+    down = val[(i + 1) * n + j];
+    left = val[i * n + (j - 1)];
+    right = val[i * n + (j + 1)];
+    return up + down + left + right;
+}
+```
+</td><td>
+
+```c
+double test_scs_opt(double *a, double *b, long i, long n) {
+    double up, down, left, right;
+    
+    long inj = i * n + j;
+
+    up = a[inj - n];
+    down = a[inj + n];
+    left = b[inj - 1];
+    right = b[inj + 1];
+    return up + down + left + right;
+}
+```
+</td></tr>
+</table>
+
+```sh {cmd hide}
+while ! [ -r 4_3.o ]; do sleep .1; done; objdump -d 4_3.o
+```
+
+The dump above contains only one `imul`, which shows that common-subexpression sharing was applied.
+
+### Remove Unnecessary Procedure
+
+Think with your intuition.
+
+## Optimization Blockers
+
+Compilers cannot always optimize your code.
+
+```c
+void lower(char *s) {
+    size_t i;
+    for (i = 0; i < strlen(s); i++) {
+        if (s[i] >= 'A' && s[i] <= 'Z') {
+            s[i] -= ('A' - 'a');
+        }
+    }
+}
+```
+
+The performance of the code above is bad: the running time quadruples when the string length doubles.
+This is because `strlen` is executed on every iteration. `strlen` itself is $O(n)$, so the overall performance of `lower` is $O(n^2)$.
+
+Therefore we optimize it with Code Motion, moving the length calculation out of the loop.
+
+```c
+void lower(char *s) {
+    size_t i;
+    size_t len = strlen(s);
+    for (i = 0; i < len; i++) {
+        if (s[i] >= 'A' && s[i] <= 'Z') {
+            s[i] -= ('A' - 'a');
+        }
+    }
+}
+```
+
+### #1 Procedure Calls
+
+A procedure may have side effects, and a function may not return the same value for the same arguments.
+
+So the compiler treats a procedure call as a black box and applies only weak optimizations near it. Therefore strong optimizations like **Code Motion** are not applied.
+
+To get strong optimizations, either use inline functions (with the `-O1` option) or **do it yourself**.
+
+### Memory Aliasing
+
+```c {cmd=gcc args=[-O1 -x c -c $input_file -o 4_4.o]}
+void sum_rows(double *a, double *b, long n) {
+    long i, j;
+    for (i = 0; i < n; i++) {
+        b[i] = 0;
+        for (j = 0; j < n; j++) {
+            b[i] += a[i * n + j];
+        }
+    }
+}
+```
+```sh {cmd hide}
+while ! [ -r 4_4.o ]; do sleep .1; done; objdump -d 4_4.o -Msuffix
+```
+
+The compiler updates `b[i]` in memory on every iteration, because it must consider the possibility that these updates affect program behavior (`a` and `b` may share memory: memory aliasing).
+
+Memory aliasing means that two different memory references specify a single location.
+In C this happens easily, because the language allows address arithmetic and direct access to storage structures.
+
+```c {cmd=gcc args=[-O1 -x c -c $input_file -o 4_5.o]}
+void sum_rows(double *a, double *b, long n) {
+    long i, j;
+    for (i = 0; i < n; i++) {
+        double val = 0;
+        for (j = 0; j < n; j++) {
+            val += a[i * n + j];
+        }
+        b[i] = val;
+    }
+}
+```
+```sh {cmd hide}
+while ! [ -r 4_5.o ]; do sleep .1; done; objdump -d 4_5.o -Msuffix
+```
+
+By introducing local variables, we can easily get optimized code.
+
+## Exploiting Instruction-Level Parallelism(ILP)
+
+Execute multiple instructions at the same time. This can reduce the average number of cycles per instruction, but exploiting it requires a general understanding of modern processor design: hardware can execute many operations in parallel.
+
+* performance limited by data dependency
+
+simple transformations can yield dramatic performance improvement.
+
+### Superscalar Processors
+
+Issue and Execute Multiple Instructions in one cycle.
+
+pipelining -> data dependency.
+
+
+For example, the functional units of a Haswell CPU:
+* 2 load
+* 1 store
+* 4 integer
+* 2 FP mult
+* 1 FP add
+* 1 FP div
+* 1 int mult
+
+### Programming with AVX2
+
+YMM registers: 256 bits each, 16 registers in total.
+
+**SIMD Operations**
+
+for single precision
+`vaddps %ymm0, %ymm1, %ymm1`
+
+for double precision
+
+`vaddpd %ymm0, %ymm1, %ymm1`