[SPCA] Restructuring, finish memory management in C, start dynamic memory management section

2026-07-28 03:39:08 +02:00 · 2026-01-07 11:54:39 +01:00
parent 2afa0ff161
commit 2c9921c6d1
23 changed files with 248 additions and 66 deletions
@@ -14,6 +14,13 @@ struct MyStruct {
        int el2;
 };

+// Like a struct, but all members share the same storage: only one of them holds a valid value at a time!
+union MyUnion {
+        int ival;
+        float fval;
+        char *sval;
+};
+
 int fun( int j ) {
    static int i = 0;              // Persists across calls of fun
    short my_var = 1;              // Block scoped (deallocated when going out of scope)
@@ -27,7 +34,10 @@ int main( int argc, char *argv[] ) {
    }
    struct MyStruct test;            // Allocate memory on stack for struct
    struct MyStruct *test_p = &test; // Pointer to memory where test resides
-    test.el1 = 1;                    // Direct element access
-    test_p->el2 = 2;                 // Via pointer
+    struct MyStruct test2;
+    union MyUnion my_uval; // Work exactly like structs for access
+    test.el1 = 1;          // Direct element access
+    test_p->el2 = 2;       // Via pointer
+    test2 = test;          // Copies the struct
    return 0;
 }
@@ -0,0 +1,22 @@
+#include <stdlib.h>
+
+int main( int argc, char *argv[] ) { // Walk-through of malloc, calloc, realloc and free
+    long *arr = (long *) malloc( 10 * sizeof( long ) ); // Allocate room for 10 longs on the heap (contents uninitialised)
+    if ( arr == NULL )                                  // malloc returns NULL on failure, so check before use
+        return EXIT_FAILURE;
+    arr[ 0 ] = 5;
+
+    long *arr2;
+    if ( ( arr2 = (long *) calloc( 10, sizeof( long ) ) ) == NULL )
+        return EXIT_FAILURE; // Same pattern as above in fewer lines; calloc additionally zeroes the memory
+
+    // Resize the block to 15 longs. NOTE(review): assigning straight back to arr2 loses the old block if realloc fails; prefer a temporary pointer
+    if ( ( arr2 = (long *) realloc( arr2, 15 * sizeof( long ) ) ) == NULL )
+        return EXIT_FAILURE;
+
+    free( arr );  // Hand the memory back to the allocator
+    arr = NULL;   // Best practice: NULL the pointer so it cannot be reused by accident
+    free( arr2 ); // *Can* omit NULLing the pointer here because the program ends right after
+
+    return EXIT_SUCCESS;
+}
@@ -0,0 +1,15 @@
+#include <stdlib.h>
+
+int main( int argc, char **argv ) { // Deliberately broken demo: the commented statements below are memory bugs, do not copy
+    int a[ 2 ];
+    int *b = malloc( 2 * sizeof( int ) ), *c;
+    a[ 2 ] = 5;          // assign past the end of an array (valid indices are 0 and 1)
+    b[ 0 ] += 2;         // assume malloc zeroes out memory (it does not, this reads garbage)
+    c = b + 3;           // mess up your pointer arithmetic (points beyond the 2-element block)
+    free( &( a[ 0 ] ) ); // pass pointer to free() that wasn't malloc'ed
+    free( b );
+    free( b );  // double-free the same block
+    b[ 0 ] = 5; // use a free()'d pointer
+    // and many more!
+    return 0;
+}
@@ -1,39 +0,0 @@
-\newpage
-\subsubsection{Declarations}
-We have already seen a few examples for how \texttt{C} handles declarations.
-In concept they are similar (and scoping works the same) to most other \texttt{C}-like programming languages, including \texttt{Java}.
-\inputcodewithfilename{c}{code-examples/00_c/00_basics/}{02_declarations.c}
-
-A peculiarity of \texttt{C} is that the bit-count is not defined by the language, but rather the hardware it is compiled for.
-\begin{fullTable}{llll}{\texttt{C} data type & typical 32-bit & ia32  & x86-64}{Comparison of byte-sizes for each datatype on different architectures}
-    \texttt{char}               & 1              & 1     & 1       \\
-    \texttt{short}              & 2              & 2     & 2       \\
-    \texttt{int}                & 4              & 4     & 4       \\
-    \texttt{long}               & 4              & 4     & 8       \\
-    \texttt{long long}          & 8              & 8     & 8       \\
-    \texttt{float}              & 4              & 4     & 4       \\
-    \texttt{double}             & 4              & 8     & 8       \\
-    \texttt{long double}        & 8              & 10/12 & 16      \\
-\end{fullTable}
-
-\drmvspace
-By default, integers in \lC\ are \texttt{signed}, to declare an unsigned integer, use \texttt{unsigned int}.
-Since it is hard and annoying to remember the number of bytes that are in each data type, \texttt{C99} has introduced the extended integer types,
-which can be imported from \texttt{stdint.h} and are of form \texttt{int<bit count>\_t} and \texttt{uint<bit count>\_t},
-where we substitute the \texttt{<bit count>} with the number of bits (have to correspond to a valid type of course).
-
-Another notable difference of \texttt{C} compared to other languages is that \texttt{C} doesn't natively have a \texttt{boolean} type,
-by convention a \texttt{short} is used to represent it, where any non-zero value means \texttt{true} and \texttt{0} means \texttt{false}.
-Since boolean types are quite handy, the \texttt{!} syntax for negation turns any non-zero value of any integer type into zero and vice-versa.
-\texttt{C99} has added support for a bool type via \texttt{stdbool.h}, which however is still an integer.
-
-Notably, \texttt{C} doesn't have a very rigid type system and lower bit-count types are implicitly cast to higher bit-count data types, i.e.
-if you add a \texttt{short} and an \texttt{int}, the \texttt{short} is cast to \texttt{short} (bits 16-31 are set to $0$) and the two are added.
-Explicit casting between almost all types is also supported.
-Some will force a change of bit representation, but most won't (notably, when casting to and from \texttt{float}-like types, minus to \texttt{void})
-
-Another important feature is that every \lC\ statement is also an expression, see above code block for example.
-
-The \texttt{void} type has \bi{no} value and is used for untyped pointers and declaring functions with no return value
-
-It is also possible to define a custom type using \texttt{typedef <type it represents> <name of the new type>}
@@ -0,0 +1,86 @@
+\newpage
+\subsubsection{Declarations}
+We have already seen a few examples for how \texttt{C} handles declarations.
+In concept they are similar (and scoping works the same) to most other \texttt{C}-like programming languages, including \texttt{Java}.
+
+\inputcodewithfilename{c}{code-examples/00_c/00_basics/}{02_declarations.c}
+
+\newpage
+A peculiarity of \texttt{C} is that the bit-count is not defined by the language, but rather the hardware it is compiled for.
+\rmvspace
+
+\begin{fullTable}{llll}{\texttt{C} data type & typical 32-bit & ia32  & x86-64}{Comparison of byte-sizes for each datatype on different architectures}
+    \texttt{char}               & 1              & 1     & 1       \\
+    \texttt{short}              & 2              & 2     & 2       \\
+    \texttt{int}                & 4              & 4     & 4       \\
+    \texttt{long}               & 4              & 4     & 8       \\
+    \texttt{long long}          & 8              & 8     & 8       \\
+    \texttt{float}              & 4              & 4     & 4       \\
+    \texttt{double}             & 8              & 8     & 8       \\
+    \texttt{long double}        & 8              & 10/12 & 16      \\
+\end{fullTable}
+
+\drmvspace
+\warn{Type format} Be however aware that this table uses the \texttt{LP64} format for the x86-64 sizes
+and this is the format all UNIX-Systems use (i.e. Linux, BSD, Darwin (the Mac Kernel)).
+64 bit Windows however uses \texttt{LLP64}, i.e. \texttt{int} and \texttt{long} have the same size (32 bit) and \texttt{long long} and pointers are 64 bit.
+
+
+\content{Integers} By default, integers in \lC\ are \texttt{signed}, to declare an unsigned integer, use \texttt{unsigned int}.
+Since it is hard and annoying to remember the number of bytes that are in each data type, \texttt{C99} has introduced the extended integer types,
+which can be imported from \texttt{stdint.h} and are of form \texttt{int<bit count>\_t} and \texttt{uint<bit count>\_t},
+where we substitute the \texttt{<bit count>} with the number of bits (have to correspond to a valid type of course).
+
+
+\content{Booleans} Another notable difference of \texttt{C} compared to other languages is that \texttt{C} doesn't natively have a \texttt{boolean} type,
+by convention a \texttt{short} is used to represent it, where any non-zero value means \texttt{true} and \texttt{0} means \texttt{false}.
+Since boolean types are quite handy, the \texttt{!} syntax for negation turns any non-zero value of any integer type into zero and vice-versa.
+\texttt{C99} has added support for a bool type via \texttt{stdbool.h}, which however is still an integer.
+
+
+\content{Implicit casts} Notably, \texttt{C} doesn't have a very rigid type system and lower bit-count types are implicitly cast to higher bit-count data types, i.e.
+if you add a \texttt{short} and an \texttt{int}, the \texttt{short} is cast to \texttt{int} (its value is preserved, i.e. bits 16-31 are filled with copies of the sign bit) and the two are added.
+Explicit casting between almost all types is also supported.
+Some will force a change of bit representation, but most won't (notably, when casting to and from \texttt{float}-like types, minus to \texttt{void})
+
+
+\content{Expressions} Every \lC\ statement is also an expression, see above code block for example.
+
+
+\content{Void} The \texttt{void} type has \bi{no} value and is used for untyped pointers and declaring functions with no return value
+
+
+\content{Structs} Are like classes in OOP, but they contain no logic.
+We can copy a struct by assignment, and structs behave just like everything else in \texttt{C} when used as an argument for functions
+in that they are passed by value and not by reference.
+You can of course pass it also by reference (like any other data type) by setting the argument to type \texttt{struct mystruct * name} and then calling the function using
+\texttt{func(\&test)} assuming \texttt{test} is the name of your struct
+
+
+\content{Typedef} A custom type can be defined using \texttt{typedef <type it represents> <name of the new type>}.
+
+You may also use \texttt{typedef} on structs using \texttt{typedef struct <struct tag> <name of the new alias>},
+you can thus instead of e.g. \verb|struct list_el my_list;| write \verb|list my_list;|, if you have used \verb|typedef struct list_el list;| before.
+It is even possible to do this:
+\drmvspace
+\begin{code}{c}
+    typedef struct list_el {
+        unsigned long val;
+        struct list_el *next;
+    } list_el;
+
+    struct list_el my_list;
+    list_el my_other_list;
+\end{code}
+\rmvspace
+
+\content{Namespaces}
+\lC\ has a few different namespaces, i.e. the same name can be used once in each namespace (e.g. you can have \texttt{struct a}, \texttt{int a}, etc).
+The following namespaces were covered:
+\rmvspace
+\begin{itemize}[noitemsep]
+    \item Label names (used for \texttt{goto})
+    \item Tags (for \texttt{struct}, \texttt{union} and \texttt{enum})
+    \item Member names (one namespace for each \texttt{struct} and \texttt{union})
+    \item Everything else mostly (types, variable names, etc, including typedef)
+\end{itemize}
@@ -1,3 +1,4 @@
+\newpage
 \subsubsection{Operators}
 The list of operators in \lC\ is similar to the one of \texttt{Java}, etc.
 In Table \ref{tab:c-operators}, you can see an overview of the operators, sorted by precedence in descending order.
@@ -18,7 +18,7 @@ The (Linux)-Kernel randomizes the address space to prevent some common exploits.
    Some pointer arithmetic has already appeared in section \ref{sec:c-arrays}, but same kind of content with better explanation can be found here
 \end{scriptsize}

-Note that when doing pointer arithmetic, adding $1$ will move the pointer by \texttt{sizeof(type)} bits.
+\content{Pointer Arithmetic} Note that when doing pointer arithmetic, adding $1$ will move the pointer by \texttt{sizeof(type)} bytes.

 You may use pointer arithmetic on whatever pointer you'd like (as long as it's not a null pointer).
 This means, you \textit{can} make an array wherever in memory you'd like.
@@ -30,22 +30,24 @@ in the docs mention that one gets undefined behaviour if you do not do as it say
 As already seen in the section arrays (section \ref{sec:c-arrays}), we can use pointer arithmetic for accessing array elements.
 The array name is treated as a pointer to the first element of the array, except when:
 \begin{itemize}[noitemsep]
-    \item it is operand of \texttt{sizeof} (return value is $n \cdot \texttt{ sizeof(type)}$ with $n$ the number of elements)
+    \item it is operand of \texttt{sizeof} (return value is $n \cdot \texttt{sizeof(type)}$ with $n$ the number of elements)
    \item its address is taken (then \texttt{\&a == a})
    \item it is a string literal initializer. If we modify a pointer \texttt{char *b = "String";} to string literal in code,
          the \texttt{"String"} is stored in the code segment and if we modify the pointer, we get undefined behaviour
 \end{itemize}
-\shade{orange}{Fun fact}: \texttt{A[i]} is always rewritten \texttt{*(A + i)} by compiler.
+\shade{purple}{Fun fact}: \texttt{A[i]} is always rewritten \texttt{*(A + i)} by compiler.

-Another important aspect is passing by value or by reference.
-You can pass every data type by reference, you can not however pass an array by value.
+\content{Function arguments} Another important aspect is passing by value or by reference.
+You can pass every data type by reference, you can not however pass an array by value (as an array is treated as a pointer, see above).

-Another interesting concept that \lC\ has to offer is body-less loops:
+\content{Body-less loops}
+\rmvspace
 \begin{code}{c}
    int x = 0;
    while ( x++ < 10 ); // This is (of course) not a useful snippet, but shows the concept
 \end{code}

-\lC\ also has an option to pass functions as arguments to functions, called function pointers.
-A function is passed using the typical address syntax with the \verb|&| symbol is annotated as argument using \verb|type (* name)(type arg1, ...)|
+\content{Function pointers}
+A function can be passed as an argument to another function using the typical address syntax with the \verb|&| symbol; the corresponding parameter is declared using
+\verb|type (* name)(type arg1, ...)|
 and is called using \verb|(*func)(arg1, ...)|.
@@ -0,0 +1,26 @@
+\subsection{Memory}
+In comparison to most other languages, \lC\ does not feature automatic memory management, but instead gives us full, manual control over memory.
+This of course has both advantages and disadvantages.
+
+\rmvspace
+\inputcodewithfilename{c}{code-examples/00_c/02_memory/}{00_memory.c}
+\drmvspace
+
+Notably, the argument \texttt{size\_t sz} for \texttt{malloc}, \texttt{calloc} and \texttt{realloc} is an \texttt{unsigned} integer of some size
+and differs depending on hardware and software platforms.
+
+\texttt{malloc} keeps track of which blocks are allocated. If you give \texttt{free} a pointer that isn't the start of the memory region previously \texttt{malloc}'d,
+you get undefined behaviour.
+
+\warn{Memory corruption} There are many ways to corrupt memory in \lC. The below code shows off a few of them:
+
+\rmvspace
+\inputcodewithfilename{c}{code-examples/00_c/02_memory/}{01_mem-corruption.c}
+\drmvspace
+
+\warn{Memory leaks} If we allocate memory, but never free it, we use more and more memory (old memory is inaccessible)
+
+\content{Dynamic data structures} We build them using structs that have a pointer to another struct inside them.
+We have to allocate memory for each element and then add the pointer to another struct.
+For a generic dynamic data structure, make the element a \texttt{void} pointer.
+This in general is the concept used for functions operating on any data type.
@@ -0,0 +1,37 @@
+\subsubsection{Dynamic Memory Allocation}
+Memory allocated with \texttt{malloc} is typically $8$- or $16$-byte aligned.
+
+\content{Explicit vs. Implicit} In explicit memory management, the application does both the allocation \textit{and} the deallocation of memory,
+whereas in implicit memory management, the application allocates the memory, but usually a \textit{Garbage Collector} (GC) frees it.
+
+For some languages, like Rust, one would assume that they use implicit memory management, but Rust is a language using explicit management,
+it's just that the \textit{compiler} and not the programmer decides when to allocate and when to deallocate.
+
+\warn{Assumptions in this course} We assume that memory is \bi{word} addressed (= 8 Bytes).
+
+\content{Goals} The allocation should have the highest possible throughput and at the same time the best (i.e. lowest) possible memory utilization.
+This however is usually conflicting, so we have to balance the two.
+
+\numberingOff
+\inlinedef \bi{Aggregate payload} $P_k$: All \texttt{malloc}'d stuff minus all \texttt{free}'d stuff
+
+\inlinedef \bi{Current heap size} $H_k$: Monotonically non-decreasing. Grows when \texttt{sbrk} system call is issued.
+
+\inlinedef \bi{Peak memory utilization} $U_k = (\max_{i < k} P_i) / H_k$
+
+
+A big problem for the \texttt{free} function is to know how much memory to free without knowing the size of the to be freed block.
+This is just one of many other implementation issues:
+\begin{itemize}
+    \item How do we keep track of the free blocks? I.e. where and how large are they?
+    \item What do we do with the extra space of a block when allocating a smaller block?
+    \item How do we pick a block?
+    \item How do we reinsert a freed block into the heap?
+\end{itemize}
+This all leads to an issue known as \bi{fragmentation}
+
+\inlinedef \bi{Internal Fragmentation}: If for a given block the payload (i.e. the requested size) is smaller than the block size.
+This depends on the pattern of previous requests and is thus easy to measure
+
+\inlinedef \bi{External Fragmentation}: There is enough aggregate heap memory, but there isn't a single large enough free block available.
+This depends on the pattern of future requests and is thus hard to measure
@@ -8,6 +8,9 @@
 \setFontType{sans}

 \newcommand{\lC}{\texttt{C}}
+\newcommand{\content}[1]{\shade{blue}{#1}}
+\newcommand{\warn}[1]{\bg{orange}{#1}}
+\newcommand{\danger}[1]{\shade{red}{#1}}

 \begin{document}
 \startDocument
@@ -56,31 +59,50 @@
 %          ╭────────────────────────────────────────────────╮
 %          │                    Content                     │
 %          ╰────────────────────────────────────────────────╯
+% ── Intro to x86 asm ────────────────────────────────────────────────
+\newsection
+\section{Introduction}
+This summary tries to summarize everything that is important to know for this course.
+It aims to be a full replacement for the slides, but as with all my summaries, there may be missing or incorrect information in here,
+so use at your own risk. You have been warned!
+
+The summary does \textit{not} follow the order the lecture does.
+This is to make related information appear closer together than it did in the lecture; the summary assumes you have already seen
+the concepts in the lectures or elsewhere (or are willing to be thrown in the deep end).
+
+The target semester for this summary is HS2025, so there might have been changes in your year.
+If there are changes and you'd like to update this summary, please open a pull request in the summary's repo at
+
+\begin{center}
+    \hlurl{https://github.com/janishutz/eth-summaries}
+\end{center}
+
+
+\newsection
+\section{x86 Assembly}
+\input{parts/00_asm/00_intro.tex}
+
 % ── Intro to C ──────────────────────────────────────────────────────
 \newsection
 \section{The C Programming Language}
-\input{parts/00_c/00_intro.tex}
-\input{parts/00_c/01_basics/00_intro.tex}
-\input{parts/00_c/01_basics/01_control-flow.tex}
-\input{parts/00_c/01_basics/02_declarations.tex}
-\input{parts/00_c/01_basics/03_operators.tex}
-\input{parts/00_c/01_basics/04_arrays.tex}
-\input{parts/00_c/01_basics/05_strings.tex}
-\input{parts/00_c/01_basics/06_integers.tex}
-\input{parts/00_c/01_basics/07_pointers.tex}
-\input{parts/00_c/02_preprocessor.tex}
-
-
-% ── Intro to x86 asm ────────────────────────────────────────────────
-\newsection
-\section{x86 Assembly}
-\input{parts/01_asm/00_intro.tex}
+\input{parts/01_c/00_intro.tex}
+\input{parts/01_c/01_basics/00_intro.tex}
+\input{parts/01_c/01_basics/01_control-flow.tex}
+\input{parts/01_c/01_basics/02_declarations.tex}
+\input{parts/01_c/01_basics/03_operators.tex}
+\input{parts/01_c/01_basics/04_arrays.tex}
+\input{parts/01_c/01_basics/05_strings.tex}
+\input{parts/01_c/01_basics/06_integers.tex}
+\input{parts/01_c/01_basics/07_pointers.tex}
+\input{parts/01_c/02_preprocessor.tex}
+\input{parts/01_c/03_memory/00_intro.tex}
+\input{parts/01_c/03_memory/01_allocation.tex}


 % ── Hardware recap ──────────────────────────────────────────────────
 \newsection
 \section{Hardware}
-\input{parts/02_hw/00_intro.tex}
+\input{parts/03_hw/00_intro.tex}

 Remember: Rust and the like have an \texttt{unsafe} block... \lC's equivalent to this is
 \begin{code}{c}