diff --git a/camera/Camera.h b/camera/Camera.h
index 62246b8..e102f34 100644
--- a/camera/Camera.h
+++ b/camera/Camera.h
@@ -20,6 +20,8 @@
 // @todo Please check out if we can switch to quaternions. We tried but failed.
 
 struct Camera {
+    bool is_changed;
+
     v3_f32 location;
     v4_f32 orientation;
 
@@ -39,6 +41,8 @@ struct Camera {
     f32 znear;
     f32 zfar;
     f32 aspect;
+
+    f32 view[16];
 };
 
 void
@@ -48,17 +52,19 @@ camera_update_vectors(Camera* camera)
     camera->front.x = cos_ori_x * cosf(OMS_DEG2RAD(camera->orientation.y));
     camera->front.y = sinf(OMS_DEG2RAD(camera->orientation.x));
     camera->front.z = cos_ori_x * sinf(OMS_DEG2RAD(camera->orientation.y));
-    vec3_normalize(&camera->front);
 
     vec3_cross(&camera->right, &camera->front, &camera->world_up);
-    vec3_normalize(&camera->right);
-
     vec3_cross(&camera->up, &camera->right, &camera->front);
+
+    // We tested combining these 3 normalizations into a single SIMD function, but it was slower
+    vec3_normalize(&camera->right);
+    vec3_normalize(&camera->front);
     vec3_normalize(&camera->up);
 }
 
 void camera_rotate(Camera* camera, int32 dx, int32 dy, f32 dt)
 {
+    camera->is_changed = true;
     camera->orientation.x += dy * camera->sensitivity;
     camera->orientation.y -= dx * camera->sensitivity;
 
@@ -82,6 +88,7 @@ void camera_rotate(Camera* camera, int32 dx, int32 dy, f32 dt)
 // you can have up to 4 camera movement inputs at the same time
 void camera_movement(Camera* camera, CameraMovement* movement, f32 dt, bool relative_to_world = true)
 {
+    camera->is_changed = true;
     f32 velocity = camera->speed * dt;
 
     if (relative_to_world) {
@@ -137,10 +144,11 @@ void camera_movement(Camera* camera, CameraMovement* movement, f32 dt, bool rela
 
         v3_f32 right;
         vec3_cross(&right, &camera->world_up, &forward);
-        vec3_normalize(&right);
 
         v3_f32 up;
         vec3_cross(&up, &right, &forward);
+
+        vec3_normalize(&right);
         vec3_normalize(&up);
 
         for (int32 i = 0; i < CAMERA_MAX_INPUTS; i++) {
@@ -275,10 +283,8 @@ void camera_translation_matrix_sparse_lh(const Camera* __restrict camera, f32* t
     translation[11] = camera->location.z;
 }
 
-// @performance This function might be optimizable with simd?
-//  the normalization might also be not required?
 void
-camera_view_matrix_lh(const Camera* __restrict camera, f32* __restrict view)
+camera_view_matrix_lh(Camera* __restrict camera)
 {
     v3_f32 zaxis = { camera->front.x, camera->front.y, camera->front.z };
 
@@ -289,28 +295,28 @@ camera_view_matrix_lh(const Camera* __restrict camera, f32* __restrict view)
     v3_f32 yaxis;
     vec3_cross(&yaxis, &zaxis, &xaxis);
 
-    view[0] = xaxis.x;
-    view[1] = yaxis.x;
-    view[2] = zaxis.x;
-    view[3] = 0.0f;
-    view[4] = xaxis.y;
-    view[5] = yaxis.y;
-    view[6] = zaxis.y;
-    view[7] = 0.0f;
-    view[8] = xaxis.z;
-    view[9] = yaxis.z;
-    view[10] = zaxis.z;
-    view[11] = 0;
-    view[12] = -vec3_dot(&xaxis, &camera->location);
-    view[13] = -vec3_dot(&yaxis, &camera->location);
-    view[14] = -vec3_dot(&zaxis, &camera->location);
-    view[15] = 1.0f;
+    // We tested whether a vec3_dot_sse version for the 3 dot products would make sense.
+    // It was not faster; we would only see an improvement if we did 4 dot products.
+    camera->view[0] = xaxis.x;
+    camera->view[1] = yaxis.x;
+    camera->view[2] = zaxis.x;
+    camera->view[3] = 0.0f;
+    camera->view[4] = xaxis.y;
+    camera->view[5] = yaxis.y;
+    camera->view[6] = zaxis.y;
+    camera->view[7] = 0.0f;
+    camera->view[8] = xaxis.z;
+    camera->view[9] = yaxis.z;
+    camera->view[10] = zaxis.z;
+    camera->view[11] = 0;
+    camera->view[12] = -vec3_dot(&xaxis, &camera->location);
+    camera->view[13] = -vec3_dot(&yaxis, &camera->location);
+    camera->view[14] = -vec3_dot(&zaxis, &camera->location);
+    camera->view[15] = 1.0f;
 }
 
-// @performance This function might be optimizable with simd?
-//  the normalization might also be not required?
 void
-camera_view_matrix_rh(const Camera* __restrict camera, f32* __restrict view)
+camera_view_matrix_rh(Camera* __restrict camera)
 {
     v3_f32 zaxis = { -camera->front.x, -camera->front.y, -camera->front.z };
 
@@ -321,22 +327,24 @@ camera_view_matrix_rh(const Camera* __restrict camera, f32* __restrict view)
     v3_f32 yaxis;
     vec3_cross(&yaxis, &zaxis, &xaxis);
 
-    view[0] = xaxis.x;
-    view[1] = yaxis.x;
-    view[2] = zaxis.x;
-    view[3] = 0.0f;
-    view[4] = xaxis.y;
-    view[5] = yaxis.y;
-    view[6] = zaxis.y;
-    view[7] = 0.0f;
-    view[8] = xaxis.z;
-    view[9] = yaxis.z;
-    view[10] = zaxis.z;
-    view[11] = 0;
-    view[12] = -vec3_dot(&xaxis, &camera->location);
-    view[13] = -vec3_dot(&yaxis, &camera->location);
-    view[14] = -vec3_dot(&zaxis, &camera->location);
-    view[15] = 1.0f;
+    // We tested whether a vec3_dot_sse version for the 3 dot products would make sense.
+    // It was not faster; we would only see an improvement if we did 4 dot products.
+    camera->view[0] = xaxis.x;
+    camera->view[1] = yaxis.x;
+    camera->view[2] = zaxis.x;
+    camera->view[3] = 0.0f;
+    camera->view[4] = xaxis.y;
+    camera->view[5] = yaxis.y;
+    camera->view[6] = zaxis.y;
+    camera->view[7] = 0.0f;
+    camera->view[8] = xaxis.z;
+    camera->view[9] = yaxis.z;
+    camera->view[10] = zaxis.z;
+    camera->view[11] = 0;
+    camera->view[12] = -vec3_dot(&xaxis, &camera->location);
+    camera->view[13] = -vec3_dot(&yaxis, &camera->location);
+    camera->view[14] = -vec3_dot(&zaxis, &camera->location);
+    camera->view[15] = 1.0f;
 }
 
 #endif
\ No newline at end of file
diff --git a/gpuapi/RenderUtils.h b/gpuapi/RenderUtils.h
index 02c7005..8a643cf 100644
--- a/gpuapi/RenderUtils.h
+++ b/gpuapi/RenderUtils.h
@@ -71,27 +71,27 @@ void vertex_line_create(
         y2 -= thickness / 2;
     }
 
-    float n1 = -(y2 - y1);
-    float n2 = x2 - x1;
-    float n_ = sqrtf(n2 * n2 + n1 * n1);
-    float norm1 = n1 / n_;
-    float norm2 = n2 / n_;
+    f32 n1 = -(y2 - y1);
+    f32 n2 = x2 - x1;
+    f32 n_ = sqrtf(n2 * n2 + n1 * n1);
+    f32 norm1 = n1 / n_;
+    f32 norm2 = n2 / n_;
 
     // @todo Currently we always use p1 and never p2
     //      This is wrong and depends on the Alignment, no? Maybe not
     // Calculate both parallel points to the start position
-    float p1_x1 = x1 + thickness * norm1;
-    float p1_y1 = y1 + thickness * norm2;
+    f32 p1_x1 = x1 + thickness * norm1;
+    f32 p1_y1 = y1 + thickness * norm2;
 
-    // float p2_x1 = x1 - thickness * norm1;
-    // float p2_y1 = y1 - thickness * norm2;
+    // f32 p2_x1 = x1 - thickness * norm1;
+    // f32 p2_y1 = y1 - thickness * norm2;
 
     // Calculate both parallel points to the end position
-    float p1_x2 = x2 + thickness * norm1;
-    float p1_y2 = y2 + thickness * norm2;
+    f32 p1_x2 = x2 + thickness * norm1;
+    f32 p1_y2 = y2 + thickness * norm2;
 
-    // float p2_x2 = x2 - thickness * norm1;
-    // float p2_y2 = y2 - thickness * norm2;
+    // f32 p2_x2 = x2 - thickness * norm1;
+    // f32 p2_y2 = y2 - thickness * norm2;
 
     vertex_degenerate_create(vertices, index, zindex, x1, y1);
 
@@ -148,6 +148,9 @@ void vertex_rect_create(
 
     vertex_degenerate_create(vertices, index, zindex, x, y);
 
+    f32 y_height = y + height;
+    f32 x_width = x + width;
+
     // Rectangle
     vertices[*index].position.x = x;
     vertices[*index].position.y = y;
@@ -158,14 +161,14 @@ void vertex_rect_create(
     ++(*index);
 
     vertices[*index].position.x = x;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.y = y_height;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x1;
     vertices[*index].tex_coord.y = tex_y2;
     vertices[*index].color = color_index;
     ++(*index);
 
-    vertices[*index].position.x = x + width;
+    vertices[*index].position.x = x_width;
     vertices[*index].position.y = y;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
@@ -173,8 +176,8 @@ void vertex_rect_create(
     vertices[*index].color = color_index;
     ++(*index);
 
-    vertices[*index].position.x = x + width;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.x = x_width;
+    vertices[*index].position.y = y_height;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
     vertices[*index].tex_coord.y = tex_y2;
@@ -205,6 +208,11 @@ void vertex_rect_border_create(
     // @bug While this works for the whole rectangle it doesn't work for individual borders
     // @todo We need a version where you can define individual borders
 
+    f32 y_height = y + height;
+    f32 y_thickness = y + thickness;
+    f32 x_width = x + width;
+    f32 x_thickness = x + thickness;
+
     // Rectangle
     // Top border
     vertices[*index].position.x = x;
@@ -216,14 +224,14 @@ void vertex_rect_border_create(
     ++(*index);
 
     vertices[*index].position.x = x;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.y = y_thickness;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x1;
     vertices[*index].tex_coord.y = tex_y2;
     vertices[*index].color = color_index;
     ++(*index);
 
-    vertices[*index].position.x = x + width;
+    vertices[*index].position.x = x_width;
     vertices[*index].position.y = y;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
@@ -231,8 +239,8 @@ void vertex_rect_border_create(
     vertices[*index].color = color_index;
     ++(*index);
 
-    vertices[*index].position.x = x + width;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.x = x_width;
+    vertices[*index].position.y = y_thickness;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
     vertices[*index].tex_coord.y = tex_y2;
@@ -240,24 +248,24 @@ void vertex_rect_border_create(
     ++(*index);
 
     // Right border
-    vertices[*index].position.x = x + width - thickness;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.x = x_width - thickness;
+    vertices[*index].position.y = y_thickness;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
     vertices[*index].tex_coord.y = tex_y2;
     vertices[*index].color = color_index;
     ++(*index);
 
-    vertices[*index].position.x = x + width;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.x = x_width;
+    vertices[*index].position.y = y_height;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x1;
     vertices[*index].tex_coord.y = tex_y2;
     vertices[*index].color = color_index;
     ++(*index);
 
-    vertices[*index].position.x = x + width - thickness;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.x = x_width - thickness;
+    vertices[*index].position.y = y_height;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
     vertices[*index].tex_coord.y = tex_y1;
@@ -265,8 +273,8 @@ void vertex_rect_border_create(
     ++(*index);
 
     // Bottom border
-    vertices[*index].position.x = x + width - thickness;
-    vertices[*index].position.y = y + height - thickness;
+    vertices[*index].position.x = x_width - thickness;
+    vertices[*index].position.y = y_height - thickness;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
     vertices[*index].tex_coord.y = tex_y2;
@@ -274,7 +282,7 @@ void vertex_rect_border_create(
     ++(*index);
 
     vertices[*index].position.x = x;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.y = y_height;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x1;
     vertices[*index].tex_coord.y = tex_y2;
@@ -282,7 +290,7 @@ void vertex_rect_border_create(
     ++(*index);
 
     vertices[*index].position.x = x;
-    vertices[*index].position.y = y + height - thickness;
+    vertices[*index].position.y = y_height - thickness;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
     vertices[*index].tex_coord.y = tex_y1;
@@ -290,8 +298,8 @@ void vertex_rect_border_create(
     ++(*index);
 
     // Left border
-    vertices[*index].position.x = x + thickness;
-    vertices[*index].position.y = y + height - thickness;
+    vertices[*index].position.x = x_thickness;
+    vertices[*index].position.y = y_height - thickness;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
     vertices[*index].tex_coord.y = tex_y2;
@@ -299,15 +307,15 @@ void vertex_rect_border_create(
     ++(*index);
 
     vertices[*index].position.x = x;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.y = y_thickness;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x1;
     vertices[*index].tex_coord.y = tex_y2;
     vertices[*index].color = color_index;
     ++(*index);
 
-    vertices[*index].position.x = x + thickness;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.x = x_thickness;
+    vertices[*index].position.y = y_thickness;
     vertices[*index].position.z = zindex;
     vertices[*index].tex_coord.x = tex_x2;
     vertices[*index].tex_coord.y = tex_y1;
@@ -338,13 +346,14 @@ f32 text_calculate_dimensions_height(
     f32 height,
     const Font* __restrict font, const char* __restrict text, f32 scale, int32 length
 ) {
-    f32 y = font->line_height * scale;
+    f32 line_height = font->line_height * scale;
+    f32 y = line_height;
 
     // @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value
 
-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
         if (text[i] == '\n') {
-            y += font->line_height * scale;
+            y += line_height;
         }
     }
 
@@ -363,7 +372,7 @@ f32 text_calculate_dimensions_width(
 
     // @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value
 
-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
         int32 character = is_ascii ? text[i] : utf8_get_char_at(text, i);
 
         if (character == '\n') {
@@ -376,13 +385,14 @@ f32 text_calculate_dimensions_width(
         Glyph* glyph = NULL;
         // We try to jump t othe correct glyph based on the glyph codepoint
         // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
-        if (font->glyph_count > character - first_glyph
-            && font->glyphs[character - first_glyph].codepoint == character
+        int32 perfect_glyph_pos = character - first_glyph;
+        if (font->glyph_count > perfect_glyph_pos
+            && font->glyphs[perfect_glyph_pos].codepoint == character
         ) {
-            glyph = &font->glyphs[character - first_glyph];
+            glyph = &font->glyphs[perfect_glyph_pos];
         } else {
             // @performance consider to do binary search
-            for (int j = 0; j <= character - first_glyph && j < font->glyph_count; ++j) {
+            for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
                 if (font->glyphs[j].codepoint == character) {
                     glyph = &font->glyphs[j];
 
@@ -406,8 +416,9 @@ void text_calculate_dimensions(
     f32* __restrict width, f32* __restrict height,
     const Font* __restrict font, const char* __restrict text, bool is_ascii, f32 scale, int32 length
 ) {
+    f32 line_height = font->line_height * scale;
     f32 x = 0;
-    f32 y = font->line_height * scale;
+    f32 y = line_height;
 
     f32 offset_x = 0;
 
@@ -415,12 +426,12 @@ void text_calculate_dimensions(
 
     // @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value
 
-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
         int32 character = is_ascii ? text[i] : utf8_get_char_at(text, i);
 
         if (character == '\n') {
             x = OMS_MAX(x, offset_x);
-            y += font->line_height * scale;
+            y += line_height;
 
             offset_x = 0;
 
@@ -430,13 +441,14 @@ void text_calculate_dimensions(
         Glyph* glyph = NULL;
         // We try to jump t othe correct glyph based on the glyph codepoint
         // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
-        if (font->glyph_count > character - first_glyph
-            && font->glyphs[character - first_glyph].codepoint == character
+        int32 perfect_glyph_pos = character - first_glyph;
+        if (font->glyph_count > perfect_glyph_pos
+            && font->glyphs[perfect_glyph_pos].codepoint == character
         ) {
-            glyph = &font->glyphs[character - first_glyph];
+            glyph = &font->glyphs[perfect_glyph_pos];
         } else {
             // @performance consider to do binary search
-            for (int j = 0; j <= character - first_glyph && j < font->glyph_count; ++j) {
+            for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
                 if (font->glyphs[j].codepoint == character) {
                     glyph = &font->glyphs[j];
 
@@ -465,7 +477,7 @@ f32 vertex_text_create(
 ) {
     int32 length = utf8_strlen(text);
     bool is_ascii = strlen(text) == length;
-    float scale = size / font->size;
+    f32 scale = size / font->size;
 
     // If we do a different alignment we need to pre-calculate the width and height
     if (align_h != 0 || align_v != 0) {
@@ -493,7 +505,7 @@ f32 vertex_text_create(
     uint32 first_glyph = font->glyphs[0].codepoint;
 
     f32 offset_x = x;
-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
         int32 character = is_ascii ? text[i] : utf8_get_char_at(text, i);
         if (character == '\n') {
             y += font->line_height * scale;
@@ -505,13 +517,14 @@ f32 vertex_text_create(
         Glyph* glyph = NULL;
         // We try to jump t othe correct glyph based on the glyph codepoint
         // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
-        if (font->glyph_count > character - first_glyph
-            && font->glyphs[character - first_glyph].codepoint == character
+        int32 perfect_glyph_pos = character - first_glyph;
+        if (font->glyph_count > perfect_glyph_pos
+            && font->glyphs[perfect_glyph_pos].codepoint == character
         ) {
-            glyph = &font->glyphs[character - first_glyph];
+            glyph = &font->glyphs[perfect_glyph_pos];
         } else {
             // @performance consider to do binary search
-            for (int j = 0; j <= character - first_glyph && j < font->glyph_count; ++j) {
+            for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
                 if (font->glyphs[j].codepoint == character) {
                     glyph = &font->glyphs[j];
 
@@ -602,7 +615,7 @@ f32 ui_text_create(
 
     int32 length = utf8_strlen(text->value_str);
     bool is_ascii = strlen(text->value_str) == length;
-    float scale = size->value_float / theme->font.size;
+    f32 scale = size->value_float / theme->font.size;
 
     // If we do a different alignment we need to pre-calculate the width and height
     if (align_h != NULL || align_v != NULL) {
@@ -635,7 +648,7 @@ f32 ui_text_create(
     int32 start = *index;
     f32 offset_x = x->value_int;
     f32 offset_y = y->value_int;
-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
         int32 character = is_ascii ? text->value_str[i] : utf8_get_char_at(text->value_str, i);
 
         if (character == '\n') {
@@ -648,13 +661,14 @@ f32 ui_text_create(
         Glyph* glyph = NULL;
         // We try to jump t othe correct glyph based on the glyph codepoint
         // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
-        if (theme->font.glyph_count > character - first_glyph
-            && theme->font.glyphs[character - first_glyph].codepoint == character
+        int32 perfect_glyph_pos = character - first_glyph;
+        if (theme->font.glyph_count > perfect_glyph_pos
+            && theme->font.glyphs[perfect_glyph_pos].codepoint == character
         ) {
-            glyph = &theme->font.glyphs[character - first_glyph];
+            glyph = &theme->font.glyphs[perfect_glyph_pos];
         } else {
             // @performance consider to do binary search
-            for (int j = 0; j <= character - first_glyph && j < theme->font.glyph_count; ++j) {
+            for (int32 j = 0; j <= perfect_glyph_pos && j < theme->font.glyph_count; ++j) {
                 if (theme->font.glyphs[j].codepoint == character) {
                     glyph = &theme->font.glyphs[j];
 
diff --git a/input/Input.h b/input/Input.h
index 030925e..7bb66a7 100644
--- a/input/Input.h
+++ b/input/Input.h
@@ -533,16 +533,18 @@ void
 input_hotkey_state(Input* input)
 {
     uint8 old_hotkeys[MAX_KEY_PRESSES];
-    memcpy(old_hotkeys, input->state.state_hotkeys, sizeof(uint8) * MAX_KEY_PRESSES);
+    InputState* state = &input->state;
 
-    memset(input->state.state_hotkeys, 0, sizeof(uint8) * MAX_KEY_PRESSES);
+    memcpy(old_hotkeys, state->state_hotkeys, sizeof(uint8) * MAX_KEY_PRESSES);
+
+    memset(state->state_hotkeys, 0, sizeof(uint8) * MAX_KEY_PRESSES);
 
     int32 active_hotkeys = 0;
 
     // Check every key down state
     for (int key_state = 0; key_state < MAX_KEY_STATES; ++key_state) {
-        if (input->state.state_keys[key_state].key_id == 0
-            || input->state.state_keys[key_state].key_state == KEY_STATE_RELEASED
+        if (state->state_keys[key_state].key_id == 0
+            || state->state_keys[key_state].key_state == KEY_STATE_RELEASED
         ) {
             // no key defined for this down state
             continue;
@@ -551,7 +553,7 @@ input_hotkey_state(Input* input)
         // Is a key defined for this state AND is at least one hotkey defined for this key
         //      If no hotkey is defined we don't care
         //      Careful, remember MAX_MOUSE_KEYS offset
-        InputKey* key = &input->state.state_keys[key_state];
+        InputKey* key = &state->state_keys[key_state];
         int32 internal_key_id = (key->key_id & ~(INPUT_KEYBOARD_PREFIX | INPUT_CONTROLLER_PREFIX))
             + ((bool) (key->key_id & INPUT_KEYBOARD_PREFIX)) * MAX_MOUSE_KEYS
             + ((bool) (key->key_id & INPUT_CONTROLLER_PREFIX)) * (MAX_MOUSE_KEYS + MAX_KEYBOARD_KEYS);
@@ -589,17 +591,17 @@ input_hotkey_state(Input* input)
 
                 // Hotkey already active
                 // @question Do we even need this? This shouldn't happen anyway?!
-                if (hotkey_is_active(input->state.state_hotkeys, hotkeys_for_key[possible_hotkey_idx])) {
+                if (hotkey_is_active(state->state_hotkeys, hotkeys_for_key[possible_hotkey_idx])) {
                     continue;
                 }
 
                 // store active hotkey, if it is not already active
-                bool is_pressed = hotkey_keys_are_active(input->state.state_keys, mapping, hotkeys_for_key[possible_hotkey_idx]);
+                bool is_pressed = hotkey_keys_are_active(state->state_keys, mapping, hotkeys_for_key[possible_hotkey_idx]);
                 if (!is_pressed) {
                     continue;
                 }
 
-                input->state.state_hotkeys[active_hotkeys] = hotkeys_for_key[possible_hotkey_idx];
+                state->state_hotkeys[active_hotkeys] = hotkeys_for_key[possible_hotkey_idx];
                 ++active_hotkeys;
 
                 // Run callback if defined
diff --git a/log/Debug.cpp b/log/Debug.cpp
index f8d5ca1..af4f191 100644
--- a/log/Debug.cpp
+++ b/log/Debug.cpp
@@ -84,18 +84,17 @@ void update_timing_stat(uint32 stat, const char* function)
 {
     uint64 new_tick_count = __rdtsc();
 
-    debug_container->perf_stats[stat].function = function;
-    debug_container->perf_stats[stat].delta_tick = new_tick_count - debug_container->perf_stats[stat].old_tick_count;
-    debug_container->perf_stats[stat].delta_time = (double) debug_container->perf_stats[stat].delta_tick / (double) debug_container->performance_count_frequency;
-    debug_container->perf_stats[stat].old_tick_count = new_tick_count;
+    TimingStat* timing_stat = &debug_container->perf_stats[stat];
+    timing_stat->function = function;
+    timing_stat->delta_tick = new_tick_count - timing_stat->old_tick_count;
+    timing_stat->delta_time = (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
+    timing_stat->old_tick_count = new_tick_count;
 }
 
 inline
 void update_timing_stat_start(uint32 stat, const char*)
 {
-    uint64 new_tick_count = __rdtsc();
-
-    debug_container->perf_stats[stat].old_tick_count = new_tick_count;
+    debug_container->perf_stats[stat].old_tick_count = __rdtsc();
 }
 
 inline
@@ -103,10 +102,11 @@ void update_timing_stat_end(uint32 stat, const char* function)
 {
     uint64 new_tick_count = __rdtsc();
 
-    debug_container->perf_stats[stat].function = function;
-    debug_container->perf_stats[stat].delta_tick = new_tick_count - debug_container->perf_stats[stat].old_tick_count;
-    debug_container->perf_stats[stat].delta_time = (double) debug_container->perf_stats[stat].delta_tick / (double) debug_container->performance_count_frequency;
-    debug_container->perf_stats[stat].old_tick_count = new_tick_count;
+    TimingStat* timing_stat = &debug_container->perf_stats[stat];
+    timing_stat->function = function;
+    timing_stat->delta_tick = new_tick_count - timing_stat->old_tick_count;
+    timing_stat->delta_time = (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
+    timing_stat->old_tick_count = new_tick_count;
 }
 
 inline
@@ -114,12 +114,11 @@ void update_timing_stat_end_continued(uint32 stat, const char* function)
 {
     uint64 new_tick_count = __rdtsc();
 
-    debug_container->perf_stats[stat].function = function;
-    debug_container->perf_stats[stat].delta_tick = debug_container->perf_stats[stat].delta_tick
-        + new_tick_count - debug_container->perf_stats[stat].old_tick_count;
-    debug_container->perf_stats[stat].delta_time = debug_container->perf_stats[stat].delta_time
-        + (double) debug_container->perf_stats[stat].delta_tick / (double) debug_container->performance_count_frequency;
-    debug_container->perf_stats[stat].old_tick_count = new_tick_count;
+    TimingStat* timing_stat = &debug_container->perf_stats[stat];
+    timing_stat->function = function;
+    timing_stat->delta_tick = timing_stat->delta_tick + new_tick_count - timing_stat->old_tick_count;
+    timing_stat->delta_time = timing_stat->delta_time + (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
+    timing_stat->old_tick_count = new_tick_count;
 }
 
 inline
@@ -172,23 +171,25 @@ void debug_memory_init(uint64 start, uint64 size)
         return;
     }
 
-    if (debug_container->dmc.memory_size <= debug_container->dmc.memory_element_idx) {
-        DebugMemory* old = debug_container->dmc.memory_stats;
+    DebugMemoryContainer* dmc = &debug_container->dmc;
+    if (dmc->memory_size <= dmc->memory_element_idx) {
+        DebugMemory* old = dmc->memory_stats;
 
-        debug_container->dmc.memory_size += 3;
-        debug_container->dmc.memory_stats = (DebugMemory *) calloc(debug_container->dmc.memory_size, sizeof(DebugMemory));
+        dmc->memory_size += 3;
+        dmc->memory_stats = (DebugMemory *) calloc(dmc->memory_size, sizeof(DebugMemory));
 
         if (old) {
-            memcpy(debug_container->dmc.memory_stats, old, (debug_container->dmc.memory_size - 3) * sizeof(DebugMemory));
+            memcpy(dmc->memory_stats, old, (dmc->memory_size - 3) * sizeof(DebugMemory));
             free(old);
         }
     }
 
-    debug_container->dmc.memory_stats[debug_container->dmc.memory_element_idx].start = start;
-    debug_container->dmc.memory_stats[debug_container->dmc.memory_element_idx].size = size;
-    debug_container->dmc.memory_stats[debug_container->dmc.memory_element_idx].usage = 0;
+    DebugMemory* debug_mem = &dmc->memory_stats[dmc->memory_element_idx];
+    debug_mem->start = start;
+    debug_mem->size = size;
+    debug_mem->usage = 0;
 
-    ++debug_container->dmc.memory_element_idx;
+    ++dmc->memory_element_idx;
 }
 
 void debug_memory_log(uint64 start, uint64 size, int32 type, const char* function)
@@ -206,13 +207,14 @@ void debug_memory_log(uint64 start, uint64 size, int32 type, const char* functio
         mem->action_idx = 0;
     }
 
-    mem->last_action[mem->action_idx].type = type;
-    mem->last_action[mem->action_idx].start = start - mem->start;
-    mem->last_action[mem->action_idx].size = size;
+    DebugMemoryRange* dmr = &mem->last_action[mem->action_idx];
+    dmr->type = type;
+    dmr->start = start - mem->start;
+    dmr->size = size;
 
     // We are using rdtsc since it is faster -> less debugging overhead than using time()
-    mem->last_action[mem->action_idx].time = __rdtsc();
-    mem->last_action[mem->action_idx].function_name = function;
+    dmr->time = __rdtsc();
+    dmr->function_name = function;
 
     ++mem->action_idx;
 
@@ -238,13 +240,14 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
         mem->reserve_action_idx = 0;
     }
 
-    mem->reserve_action[mem->reserve_action_idx].type = type;
-    mem->reserve_action[mem->reserve_action_idx].start = start - mem->start;
-    mem->reserve_action[mem->reserve_action_idx].size = size;
+    DebugMemoryRange* dmr = &mem->reserve_action[mem->reserve_action_idx];
+    dmr->type = type;
+    dmr->start = start - mem->start;
+    dmr->size = size;
 
     // We are using rdtsc since it is faster -> less debugging overhead than using time()
-    mem->reserve_action[mem->reserve_action_idx].time = __rdtsc();
-    mem->reserve_action[mem->reserve_action_idx].function_name = function;
+    dmr->time = __rdtsc();
+    dmr->function_name = function;
 
     ++mem->reserve_action_idx;
 }
@@ -273,29 +276,30 @@ byte* log_get_memory(uint64 size, byte aligned = 1, bool zeroed = false)
         return 0;
     }
 
-    ASSERT_SIMPLE(size <= debug_container->log_memory.size);
+    LogMemory* log_mem = &debug_container->log_memory;
+    ASSERT_SIMPLE(size <= log_mem->size);
 
     if (aligned > 1) {
-        uintptr_t address = (uintptr_t) debug_container->log_memory.memory;
-        debug_container->log_memory.pos += (aligned - ((address + debug_container->log_memory.pos) & (aligned - 1))) % aligned;
+        uintptr_t address = (uintptr_t) log_mem->memory;
+        log_mem->pos += (aligned - ((address + log_mem->pos) & (aligned - 1))) % aligned;
     }
 
     size = ROUND_TO_NEAREST(size, aligned);
-    if (debug_container->log_memory.pos + size > debug_container->log_memory.size) {
-        debug_container->log_memory.pos = 0;
+    if (log_mem->pos + size > log_mem->size) {
+        log_mem->pos = 0;
 
         if (aligned > 1) {
-            uintptr_t address = (uintptr_t) debug_container->log_memory.memory;
-            debug_container->log_memory.pos += (aligned - ((address + debug_container->log_memory.pos) & (aligned - 1))) % aligned;
+            uintptr_t address = (uintptr_t) log_mem->memory;
+            log_mem->pos += (aligned - ((address + log_mem->pos) & (aligned - 1))) % aligned;
         }
     }
 
-    byte* offset = (byte *) (debug_container->log_memory.memory + debug_container->log_memory.pos);
+    byte* offset = (byte *) (log_mem->memory + log_mem->pos);
     if (zeroed) {
         memset((void *) offset, 0, size);
     }
 
-    debug_container->log_memory.pos += size;
+    log_mem->pos += size;
 
     return offset;
 }
diff --git a/math/matrix/MatrixFloat32.h b/math/matrix/MatrixFloat32.h
index 1f3a6b5..12a8fe6 100644
--- a/math/matrix/MatrixFloat32.h
+++ b/math/matrix/MatrixFloat32.h
@@ -23,6 +23,7 @@
 
 // @todo Implement intrinsic versions!
 
+inline
 void vec2_normalize(f32* __restrict x, f32* __restrict y)
 {
     f32 d = sqrtf((*x) * (*x) + (*y) * (*y));
@@ -94,6 +95,7 @@ f32 vec2_dot(const v2_f32* a, const v2_f32* b) {
     return a->x * b->x + a->y * b->y;
 }
 
+inline
 void vec3_normalize(f32* __restrict x, f32* __restrict y, f32* __restrict z)
 {
     f32 d = sqrtf((*x) * (*x) + (*y) * (*y) + (*z) * (*z));
@@ -103,6 +105,7 @@ void vec3_normalize(f32* __restrict x, f32* __restrict y, f32* __restrict z)
     *z /= d;
 }
 
+inline
 void vec3_normalize(v3_f32* vec)
 {
     f32 d = sqrtf(vec->x * vec->x + vec->y * vec->y + vec->z * vec->z);
@@ -179,6 +182,7 @@ void vec3_cross(v3_f32* __restrict vec, const v3_f32* a, const v3_f32* b) {
     vec->z = a->x * b->y - a->y * b->x;
 }
 
+inline
 f32 vec3_dot(const v3_f32* a, const v3_f32* b) {
     return a->x * b->x + a->y * b->y + a->z * b->z;
 }