minor code optimizations

2026-01-11 19:28:40 +00:00 · 2024-11-22 10:25:47 +01:00 · 2024-11-22 10:25:47 +01:00 · 333c7d6e53
commit 333c7d6e53
parent ab9ab3fceb
5 changed files with 190 additions and 158 deletions
--- a/camera/Camera.h
+++ b/camera/Camera.h
@ -20,6 +20,8 @@
 // @todo Please check out if we can switch to quaternions. We tried but failed.

 struct Camera {
+    bool is_changed;
+
    v3_f32 location;
    v4_f32 orientation;

@ -39,6 +41,8 @@ struct Camera {
    f32 znear;
    f32 zfar;
    f32 aspect;
+
+    f32 view[16];
 };

 void
@ -48,17 +52,19 @@ camera_update_vectors(Camera* camera)
    camera->front.x = cos_ori_x * cosf(OMS_DEG2RAD(camera->orientation.y));
    camera->front.y = sinf(OMS_DEG2RAD(camera->orientation.x));
    camera->front.z = cos_ori_x * sinf(OMS_DEG2RAD(camera->orientation.y));
-    vec3_normalize(&camera->front);

    vec3_cross(&camera->right, &camera->front, &camera->world_up);
-    vec3_normalize(&camera->right);
-
    vec3_cross(&camera->up, &camera->right, &camera->front);
+
+    // We tested combining these 3 normalizations into a single SIMD function, but it was slower
+    vec3_normalize(&camera->right);
+    vec3_normalize(&camera->front);
    vec3_normalize(&camera->up);
 }

 void camera_rotate(Camera* camera, int32 dx, int32 dy, f32 dt)
 {
+    camera->is_changed = true;
    camera->orientation.x += dy * camera->sensitivity;
    camera->orientation.y -= dx * camera->sensitivity;

@ -82,6 +88,7 @@ void camera_rotate(Camera* camera, int32 dx, int32 dy, f32 dt)
 // you can have up to 4 camera movement inputs at the same time
 void camera_movement(Camera* camera, CameraMovement* movement, f32 dt, bool relative_to_world = true)
 {
+    camera->is_changed = true;
    f32 velocity = camera->speed * dt;

    if (relative_to_world) {
@ -137,10 +144,11 @@ void camera_movement(Camera* camera, CameraMovement* movement, f32 dt, bool rela

        v3_f32 right;
        vec3_cross(&right, &camera->world_up, &forward);
-        vec3_normalize(&right);

        v3_f32 up;
        vec3_cross(&up, &right, &forward);
+
+        vec3_normalize(&right);
        vec3_normalize(&up);

        for (int32 i = 0; i < CAMERA_MAX_INPUTS; i++) {
@ -275,10 +283,8 @@ void camera_translation_matrix_sparse_lh(const Camera* __restrict camera, f32* t
    translation[11] = camera->location.z;
 }

-// @performance This function might be optimizable with simd?
-//  the normalization might also be not required?
 void
-camera_view_matrix_lh(const Camera* __restrict camera, f32* __restrict view)
+camera_view_matrix_lh(Camera* __restrict camera)
 {
    v3_f32 zaxis = { camera->front.x, camera->front.y, camera->front.z };

@ -289,28 +295,28 @@ camera_view_matrix_lh(const Camera* __restrict camera, f32* __restrict view)
    v3_f32 yaxis;
    vec3_cross(&yaxis, &zaxis, &xaxis);

-    view[0] = xaxis.x;
-    view[1] = yaxis.x;
-    view[2] = zaxis.x;
-    view[3] = 0.0f;
-    view[4] = xaxis.y;
-    view[5] = yaxis.y;
-    view[6] = zaxis.y;
-    view[7] = 0.0f;
-    view[8] = xaxis.z;
-    view[9] = yaxis.z;
-    view[10] = zaxis.z;
-    view[11] = 0;
-    view[12] = -vec3_dot(&xaxis, &camera->location);
-    view[13] = -vec3_dot(&yaxis, &camera->location);
-    view[14] = -vec3_dot(&zaxis, &camera->location);
-    view[15] = 1.0f;
+    // We tested whether a vec3_dot_sse version would make sense for the 3 dot products.
+    // The result: it is not faster; we would only see an improvement with 4 dot products.
+    camera->view[0] = xaxis.x;
+    camera->view[1] = yaxis.x;
+    camera->view[2] = zaxis.x;
+    camera->view[3] = 0.0f;
+    camera->view[4] = xaxis.y;
+    camera->view[5] = yaxis.y;
+    camera->view[6] = zaxis.y;
+    camera->view[7] = 0.0f;
+    camera->view[8] = xaxis.z;
+    camera->view[9] = yaxis.z;
+    camera->view[10] = zaxis.z;
+    camera->view[11] = 0;
+    camera->view[12] = -vec3_dot(&xaxis, &camera->location);
+    camera->view[13] = -vec3_dot(&yaxis, &camera->location);
+    camera->view[14] = -vec3_dot(&zaxis, &camera->location);
+    camera->view[15] = 1.0f;
 }

-// @performance This function might be optimizable with simd?
-//  the normalization might also be not required?
 void
-camera_view_matrix_rh(const Camera* __restrict camera, f32* __restrict view)
+camera_view_matrix_rh(Camera* __restrict camera)
 {
    v3_f32 zaxis = { -camera->front.x, -camera->front.y, -camera->front.z };

@ -321,22 +327,24 @@ camera_view_matrix_rh(const Camera* __restrict camera, f32* __restrict view)
    v3_f32 yaxis;
    vec3_cross(&yaxis, &zaxis, &xaxis);

-    view[0] = xaxis.x;
-    view[1] = yaxis.x;
-    view[2] = zaxis.x;
-    view[3] = 0.0f;
-    view[4] = xaxis.y;
-    view[5] = yaxis.y;
-    view[6] = zaxis.y;
-    view[7] = 0.0f;
-    view[8] = xaxis.z;
-    view[9] = yaxis.z;
-    view[10] = zaxis.z;
-    view[11] = 0;
-    view[12] = -vec3_dot(&xaxis, &camera->location);
-    view[13] = -vec3_dot(&yaxis, &camera->location);
-    view[14] = -vec3_dot(&zaxis, &camera->location);
-    view[15] = 1.0f;
+    // We tested whether a vec3_dot_sse version would make sense for the 3 dot products.
+    // The result: it is not faster; we would only see an improvement with 4 dot products.
+    camera->view[0] = xaxis.x;
+    camera->view[1] = yaxis.x;
+    camera->view[2] = zaxis.x;
+    camera->view[3] = 0.0f;
+    camera->view[4] = xaxis.y;
+    camera->view[5] = yaxis.y;
+    camera->view[6] = zaxis.y;
+    camera->view[7] = 0.0f;
+    camera->view[8] = xaxis.z;
+    camera->view[9] = yaxis.z;
+    camera->view[10] = zaxis.z;
+    camera->view[11] = 0;
+    camera->view[12] = -vec3_dot(&xaxis, &camera->location);
+    camera->view[13] = -vec3_dot(&yaxis, &camera->location);
+    camera->view[14] = -vec3_dot(&zaxis, &camera->location);
+    camera->view[15] = 1.0f;
 }

 #endif
--- a/gpuapi/RenderUtils.h
+++ b/gpuapi/RenderUtils.h
@ -71,27 +71,27 @@ void vertex_line_create(
        y2 -= thickness / 2;
    }

-    float n1 = -(y2 - y1);
-    float n2 = x2 - x1;
-    float n_ = sqrtf(n2 * n2 + n1 * n1);
-    float norm1 = n1 / n_;
-    float norm2 = n2 / n_;
+    f32 n1 = -(y2 - y1);
+    f32 n2 = x2 - x1;
+    f32 n_ = sqrtf(n2 * n2 + n1 * n1);
+    f32 norm1 = n1 / n_;
+    f32 norm2 = n2 / n_;

    // @todo Currently we always use p1 and never p2
    //      This is wrong and depends on the Alignment, no? Maybe not
    // Calculate both parallel points to the start position
-    float p1_x1 = x1 + thickness * norm1;
-    float p1_y1 = y1 + thickness * norm2;
+    f32 p1_x1 = x1 + thickness * norm1;
+    f32 p1_y1 = y1 + thickness * norm2;

-    // float p2_x1 = x1 - thickness * norm1;
-    // float p2_y1 = y1 - thickness * norm2;
+    // f32 p2_x1 = x1 - thickness * norm1;
+    // f32 p2_y1 = y1 - thickness * norm2;

    // Calculate both parallel points to the end position
-    float p1_x2 = x2 + thickness * norm1;
-    float p1_y2 = y2 + thickness * norm2;
+    f32 p1_x2 = x2 + thickness * norm1;
+    f32 p1_y2 = y2 + thickness * norm2;

-    // float p2_x2 = x2 - thickness * norm1;
-    // float p2_y2 = y2 - thickness * norm2;
+    // f32 p2_x2 = x2 - thickness * norm1;
+    // f32 p2_y2 = y2 - thickness * norm2;

    vertex_degenerate_create(vertices, index, zindex, x1, y1);

@ -148,6 +148,9 @@ void vertex_rect_create(

    vertex_degenerate_create(vertices, index, zindex, x, y);

+    f32 y_height = y + height;
+    f32 x_width = x + width;
+
    // Rectangle
    vertices[*index].position.x = x;
    vertices[*index].position.y = y;
@ -158,14 +161,14 @@ void vertex_rect_create(
    ++(*index);

    vertices[*index].position.x = x;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.y = y_height;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x1;
    vertices[*index].tex_coord.y = tex_y2;
    vertices[*index].color = color_index;
    ++(*index);

-    vertices[*index].position.x = x + width;
+    vertices[*index].position.x = x_width;
    vertices[*index].position.y = y;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
@ -173,8 +176,8 @@ void vertex_rect_create(
    vertices[*index].color = color_index;
    ++(*index);

-    vertices[*index].position.x = x + width;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.x = x_width;
+    vertices[*index].position.y = y_height;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
    vertices[*index].tex_coord.y = tex_y2;
@ -205,6 +208,11 @@ void vertex_rect_border_create(
    // @bug While this works for the whole rectangle it doesn't work for individual borders
    // @todo We need a version where you can define individual borders

+    f32 y_height = y + height;
+    f32 y_thickness = y + thickness;
+    f32 x_width = x + width;
+    f32 x_thickness = x + thickness;
+
    // Rectangle
    // Top border
    vertices[*index].position.x = x;
@ -216,14 +224,14 @@ void vertex_rect_border_create(
    ++(*index);

    vertices[*index].position.x = x;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.y = y_thickness;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x1;
    vertices[*index].tex_coord.y = tex_y2;
    vertices[*index].color = color_index;
    ++(*index);

-    vertices[*index].position.x = x + width;
+    vertices[*index].position.x = x_width;
    vertices[*index].position.y = y;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
@ -231,8 +239,8 @@ void vertex_rect_border_create(
    vertices[*index].color = color_index;
    ++(*index);

-    vertices[*index].position.x = x + width;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.x = x_width;
+    vertices[*index].position.y = y_thickness;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
    vertices[*index].tex_coord.y = tex_y2;
@ -240,24 +248,24 @@ void vertex_rect_border_create(
    ++(*index);

    // Right border
-    vertices[*index].position.x = x + width - thickness;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.x = x_width - thickness;
+    vertices[*index].position.y = y_thickness;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
    vertices[*index].tex_coord.y = tex_y2;
    vertices[*index].color = color_index;
    ++(*index);

-    vertices[*index].position.x = x + width;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.x = x_width;
+    vertices[*index].position.y = y_height;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x1;
    vertices[*index].tex_coord.y = tex_y2;
    vertices[*index].color = color_index;
    ++(*index);

-    vertices[*index].position.x = x + width - thickness;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.x = x_width - thickness;
+    vertices[*index].position.y = y_height;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
    vertices[*index].tex_coord.y = tex_y1;
@ -265,8 +273,8 @@ void vertex_rect_border_create(
    ++(*index);

    // Bottom border
-    vertices[*index].position.x = x + width - thickness;
-    vertices[*index].position.y = y + height - thickness;
+    vertices[*index].position.x = x_width - thickness;
+    vertices[*index].position.y = y_height - thickness;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
    vertices[*index].tex_coord.y = tex_y2;
@ -274,7 +282,7 @@ void vertex_rect_border_create(
    ++(*index);

    vertices[*index].position.x = x;
-    vertices[*index].position.y = y + height;
+    vertices[*index].position.y = y_height;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x1;
    vertices[*index].tex_coord.y = tex_y2;
@ -282,7 +290,7 @@ void vertex_rect_border_create(
    ++(*index);

    vertices[*index].position.x = x;
-    vertices[*index].position.y = y + height - thickness;
+    vertices[*index].position.y = y_height - thickness;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
    vertices[*index].tex_coord.y = tex_y1;
@ -290,8 +298,8 @@ void vertex_rect_border_create(
    ++(*index);

    // Left border
-    vertices[*index].position.x = x + thickness;
-    vertices[*index].position.y = y + height - thickness;
+    vertices[*index].position.x = x_thickness;
+    vertices[*index].position.y = y_height - thickness;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
    vertices[*index].tex_coord.y = tex_y2;
@ -299,15 +307,15 @@ void vertex_rect_border_create(
    ++(*index);

    vertices[*index].position.x = x;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.y = y_thickness;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x1;
    vertices[*index].tex_coord.y = tex_y2;
    vertices[*index].color = color_index;
    ++(*index);

-    vertices[*index].position.x = x + thickness;
-    vertices[*index].position.y = y + thickness;
+    vertices[*index].position.x = x_thickness;
+    vertices[*index].position.y = y_thickness;
    vertices[*index].position.z = zindex;
    vertices[*index].tex_coord.x = tex_x2;
    vertices[*index].tex_coord.y = tex_y1;
@ -338,13 +346,14 @@ f32 text_calculate_dimensions_height(
    f32 height,
    const Font* __restrict font, const char* __restrict text, f32 scale, int32 length
 ) {
-    f32 y = font->line_height * scale;
+    f32 line_height = font->line_height * scale;
+    f32 y = line_height;

    // @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value

-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
        if (text[i] == '\n') {
-            y += font->line_height * scale;
+            y += line_height;
        }
    }

@ -363,7 +372,7 @@ f32 text_calculate_dimensions_width(

    // @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value

-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
        int32 character = is_ascii ? text[i] : utf8_get_char_at(text, i);

        if (character == '\n') {
@ -376,13 +385,14 @@ f32 text_calculate_dimensions_width(
        Glyph* glyph = NULL;
        // We try to jump to the correct glyph based on the glyph codepoint
        // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
-        if (font->glyph_count > character - first_glyph
-            && font->glyphs[character - first_glyph].codepoint == character
+        int32 perfect_glyph_pos = character - first_glyph;
+        if (font->glyph_count > perfect_glyph_pos
+            && font->glyphs[perfect_glyph_pos].codepoint == character
        ) {
-            glyph = &font->glyphs[character - first_glyph];
+            glyph = &font->glyphs[perfect_glyph_pos];
        } else {
            // @performance consider to do binary search
-            for (int j = 0; j <= character - first_glyph && j < font->glyph_count; ++j) {
+            for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
                if (font->glyphs[j].codepoint == character) {
                    glyph = &font->glyphs[j];

@ -406,8 +416,9 @@ void text_calculate_dimensions(
    f32* __restrict width, f32* __restrict height,
    const Font* __restrict font, const char* __restrict text, bool is_ascii, f32 scale, int32 length
 ) {
+    f32 line_height = font->line_height * scale;
    f32 x = 0;
-    f32 y = font->line_height * scale;
+    f32 y = line_height;

    f32 offset_x = 0;

@ -415,12 +426,12 @@ void text_calculate_dimensions(

    // @todo remember to restrict to width/height if value > 0 -> force width to remain below certain value

-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
        int32 character = is_ascii ? text[i] : utf8_get_char_at(text, i);

        if (character == '\n') {
            x = OMS_MAX(x, offset_x);
-            y += font->line_height * scale;
+            y += line_height;

            offset_x = 0;

@ -430,13 +441,14 @@ void text_calculate_dimensions(
        Glyph* glyph = NULL;
        // We try to jump to the correct glyph based on the glyph codepoint
        // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
-        if (font->glyph_count > character - first_glyph
-            && font->glyphs[character - first_glyph].codepoint == character
+        int32 perfect_glyph_pos = character - first_glyph;
+        if (font->glyph_count > perfect_glyph_pos
+            && font->glyphs[perfect_glyph_pos].codepoint == character
        ) {
-            glyph = &font->glyphs[character - first_glyph];
+            glyph = &font->glyphs[perfect_glyph_pos];
        } else {
            // @performance consider to do binary search
-            for (int j = 0; j <= character - first_glyph && j < font->glyph_count; ++j) {
+            for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
                if (font->glyphs[j].codepoint == character) {
                    glyph = &font->glyphs[j];

@ -465,7 +477,7 @@ f32 vertex_text_create(
 ) {
    int32 length = utf8_strlen(text);
    bool is_ascii = strlen(text) == length;
-    float scale = size / font->size;
+    f32 scale = size / font->size;

    // If we do a different alignment we need to pre-calculate the width and height
    if (align_h != 0 || align_v != 0) {
@ -493,7 +505,7 @@ f32 vertex_text_create(
    uint32 first_glyph = font->glyphs[0].codepoint;

    f32 offset_x = x;
-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
        int32 character = is_ascii ? text[i] : utf8_get_char_at(text, i);
        if (character == '\n') {
            y += font->line_height * scale;
@ -505,13 +517,14 @@ f32 vertex_text_create(
        Glyph* glyph = NULL;
        // We try to jump t othe correct glyph based on the glyph codepoint
        // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
-        if (font->glyph_count > character - first_glyph
-            && font->glyphs[character - first_glyph].codepoint == character
+        int32 perfect_glyph_pos = character - first_glyph;
+        if (font->glyph_count > perfect_glyph_pos
+            && font->glyphs[perfect_glyph_pos].codepoint == character
        ) {
-            glyph = &font->glyphs[character - first_glyph];
+            glyph = &font->glyphs[perfect_glyph_pos];
        } else {
            // @performance consider to do binary search
-            for (int j = 0; j <= character - first_glyph && j < font->glyph_count; ++j) {
+            for (int32 j = 0; j <= perfect_glyph_pos && j < font->glyph_count; ++j) {
                if (font->glyphs[j].codepoint == character) {
                    glyph = &font->glyphs[j];

@ -602,7 +615,7 @@ f32 ui_text_create(

    int32 length = utf8_strlen(text->value_str);
    bool is_ascii = strlen(text->value_str) == length;
-    float scale = size->value_float / theme->font.size;
+    f32 scale = size->value_float / theme->font.size;

    // If we do a different alignment we need to pre-calculate the width and height
    if (align_h != NULL || align_v != NULL) {
@ -635,7 +648,7 @@ f32 ui_text_create(
    int32 start = *index;
    f32 offset_x = x->value_int;
    f32 offset_y = y->value_int;
-    for (int i = 0; i < length; ++i) {
+    for (int32 i = 0; i < length; ++i) {
        int32 character = is_ascii ? text->value_str[i] : utf8_get_char_at(text->value_str, i);

        if (character == '\n') {
@ -648,13 +661,14 @@ f32 ui_text_create(
        Glyph* glyph = NULL;
        // We try to jump to the correct glyph based on the glyph codepoint
        // If that doesn't work we iterate the glyph list BUT only until the last possible match (glyphs must be sorted ascending)
-        if (theme->font.glyph_count > character - first_glyph
-            && theme->font.glyphs[character - first_glyph].codepoint == character
+        int32 perfect_glyph_pos = character - first_glyph;
+        if (theme->font.glyph_count > perfect_glyph_pos
+            && theme->font.glyphs[perfect_glyph_pos].codepoint == character
        ) {
-            glyph = &theme->font.glyphs[character - first_glyph];
+            glyph = &theme->font.glyphs[perfect_glyph_pos];
        } else {
            // @performance consider to do binary search
-            for (int j = 0; j <= character - first_glyph && j < theme->font.glyph_count; ++j) {
+            for (int32 j = 0; j <= perfect_glyph_pos && j < theme->font.glyph_count; ++j) {
                if (theme->font.glyphs[j].codepoint == character) {
                    glyph = &theme->font.glyphs[j];

--- a/input/Input.h
+++ b/input/Input.h
@ -533,16 +533,18 @@ void
 input_hotkey_state(Input* input)
 {
    uint8 old_hotkeys[MAX_KEY_PRESSES];
-    memcpy(old_hotkeys, input->state.state_hotkeys, sizeof(uint8) * MAX_KEY_PRESSES);
+    InputState* state = &input->state;

-    memset(input->state.state_hotkeys, 0, sizeof(uint8) * MAX_KEY_PRESSES);
+    memcpy(old_hotkeys, state->state_hotkeys, sizeof(uint8) * MAX_KEY_PRESSES);
+
+    memset(state->state_hotkeys, 0, sizeof(uint8) * MAX_KEY_PRESSES);

    int32 active_hotkeys = 0;

    // Check every key down state
    for (int key_state = 0; key_state < MAX_KEY_STATES; ++key_state) {
-        if (input->state.state_keys[key_state].key_id == 0
-            || input->state.state_keys[key_state].key_state == KEY_STATE_RELEASED
+        if (state->state_keys[key_state].key_id == 0
+            || state->state_keys[key_state].key_state == KEY_STATE_RELEASED
        ) {
            // no key defined for this down state
            continue;
@ -551,7 +553,7 @@ input_hotkey_state(Input* input)
        // Is a key defined for this state AND is at least one hotkey defined for this key
        //      If no hotkey is defined we don't care
        //      Careful, remember MAX_MOUSE_KEYS offset
-        InputKey* key = &input->state.state_keys[key_state];
+        InputKey* key = &state->state_keys[key_state];
        int32 internal_key_id = (key->key_id & ~(INPUT_KEYBOARD_PREFIX | INPUT_CONTROLLER_PREFIX))
            + ((bool) (key->key_id & INPUT_KEYBOARD_PREFIX)) * MAX_MOUSE_KEYS
            + ((bool) (key->key_id & INPUT_CONTROLLER_PREFIX)) * (MAX_MOUSE_KEYS + MAX_KEYBOARD_KEYS);
@ -589,17 +591,17 @@ input_hotkey_state(Input* input)

                // Hotkey already active
                // @question Do we even need this? This shouldn't happen anyway?!
-                if (hotkey_is_active(input->state.state_hotkeys, hotkeys_for_key[possible_hotkey_idx])) {
+                if (hotkey_is_active(state->state_hotkeys, hotkeys_for_key[possible_hotkey_idx])) {
                    continue;
                }

                // store active hotkey, if it is not already active
-                bool is_pressed = hotkey_keys_are_active(input->state.state_keys, mapping, hotkeys_for_key[possible_hotkey_idx]);
+                bool is_pressed = hotkey_keys_are_active(state->state_keys, mapping, hotkeys_for_key[possible_hotkey_idx]);
                if (!is_pressed) {
                    continue;
                }

-                input->state.state_hotkeys[active_hotkeys] = hotkeys_for_key[possible_hotkey_idx];
+                state->state_hotkeys[active_hotkeys] = hotkeys_for_key[possible_hotkey_idx];
                ++active_hotkeys;

                // Run callback if defined
--- a/log/Debug.cpp
+++ b/log/Debug.cpp
@ -84,18 +84,17 @@ void update_timing_stat(uint32 stat, const char* function)
 {
    uint64 new_tick_count = __rdtsc();

-    debug_container->perf_stats[stat].function = function;
-    debug_container->perf_stats[stat].delta_tick = new_tick_count - debug_container->perf_stats[stat].old_tick_count;
-    debug_container->perf_stats[stat].delta_time = (double) debug_container->perf_stats[stat].delta_tick / (double) debug_container->performance_count_frequency;
-    debug_container->perf_stats[stat].old_tick_count = new_tick_count;
+    TimingStat* timing_stat = &debug_container->perf_stats[stat];
+    timing_stat->function = function;
+    timing_stat->delta_tick = new_tick_count - timing_stat->old_tick_count;
+    timing_stat->delta_time = (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
+    timing_stat->old_tick_count = new_tick_count;
 }

 inline
 void update_timing_stat_start(uint32 stat, const char*)
 {
-    uint64 new_tick_count = __rdtsc();
-
-    debug_container->perf_stats[stat].old_tick_count = new_tick_count;
+    debug_container->perf_stats[stat].old_tick_count = __rdtsc();
 }

 inline
@ -103,10 +102,11 @@ void update_timing_stat_end(uint32 stat, const char* function)
 {
    uint64 new_tick_count = __rdtsc();

-    debug_container->perf_stats[stat].function = function;
-    debug_container->perf_stats[stat].delta_tick = new_tick_count - debug_container->perf_stats[stat].old_tick_count;
-    debug_container->perf_stats[stat].delta_time = (double) debug_container->perf_stats[stat].delta_tick / (double) debug_container->performance_count_frequency;
-    debug_container->perf_stats[stat].old_tick_count = new_tick_count;
+    TimingStat* timing_stat = &debug_container->perf_stats[stat];
+    timing_stat->function = function;
+    timing_stat->delta_tick = new_tick_count - timing_stat->old_tick_count;
+    timing_stat->delta_time = (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
+    timing_stat->old_tick_count = new_tick_count;
 }

 inline
@ -114,12 +114,11 @@ void update_timing_stat_end_continued(uint32 stat, const char* function)
 {
    uint64 new_tick_count = __rdtsc();

-    debug_container->perf_stats[stat].function = function;
-    debug_container->perf_stats[stat].delta_tick = debug_container->perf_stats[stat].delta_tick
-        + new_tick_count - debug_container->perf_stats[stat].old_tick_count;
-    debug_container->perf_stats[stat].delta_time = debug_container->perf_stats[stat].delta_time
-        + (double) debug_container->perf_stats[stat].delta_tick / (double) debug_container->performance_count_frequency;
-    debug_container->perf_stats[stat].old_tick_count = new_tick_count;
+    TimingStat* timing_stat = &debug_container->perf_stats[stat];
+    timing_stat->function = function;
+    timing_stat->delta_tick = timing_stat->delta_tick + new_tick_count - timing_stat->old_tick_count;
+    timing_stat->delta_time = timing_stat->delta_time + (double) timing_stat->delta_tick / (double) debug_container->performance_count_frequency;
+    timing_stat->old_tick_count = new_tick_count;
 }

 inline
@ -172,23 +171,25 @@ void debug_memory_init(uint64 start, uint64 size)
        return;
    }

-    if (debug_container->dmc.memory_size <= debug_container->dmc.memory_element_idx) {
-        DebugMemory* old = debug_container->dmc.memory_stats;
+    DebugMemoryContainer* dmc = &debug_container->dmc;
+    if (dmc->memory_size <= dmc->memory_element_idx) {
+        DebugMemory* old = dmc->memory_stats;

-        debug_container->dmc.memory_size += 3;
-        debug_container->dmc.memory_stats = (DebugMemory *) calloc(debug_container->dmc.memory_size, sizeof(DebugMemory));
+        dmc->memory_size += 3;
+        dmc->memory_stats = (DebugMemory *) calloc(dmc->memory_size, sizeof(DebugMemory));

        if (old) {
-            memcpy(debug_container->dmc.memory_stats, old, (debug_container->dmc.memory_size - 3) * sizeof(DebugMemory));
+            memcpy(dmc->memory_stats, old, (dmc->memory_size - 3) * sizeof(DebugMemory));
            free(old);
        }
    }

-    debug_container->dmc.memory_stats[debug_container->dmc.memory_element_idx].start = start;
-    debug_container->dmc.memory_stats[debug_container->dmc.memory_element_idx].size = size;
-    debug_container->dmc.memory_stats[debug_container->dmc.memory_element_idx].usage = 0;
+    DebugMemory* debug_mem = &dmc->memory_stats[dmc->memory_element_idx];
+    debug_mem->start = start;
+    debug_mem->size = size;
+    debug_mem->usage = 0;

-    ++debug_container->dmc.memory_element_idx;
+    ++dmc->memory_element_idx;
 }

 void debug_memory_log(uint64 start, uint64 size, int32 type, const char* function)
@ -206,13 +207,14 @@ void debug_memory_log(uint64 start, uint64 size, int32 type, const char* functio
        mem->action_idx = 0;
    }

-    mem->last_action[mem->action_idx].type = type;
-    mem->last_action[mem->action_idx].start = start - mem->start;
-    mem->last_action[mem->action_idx].size = size;
+    DebugMemoryRange* dmr = &mem->last_action[mem->action_idx];
+    dmr->type = type;
+    dmr->start = start - mem->start;
+    dmr->size = size;

    // We are using rdtsc since it is faster -> less debugging overhead than using time()
-    mem->last_action[mem->action_idx].time = __rdtsc();
-    mem->last_action[mem->action_idx].function_name = function;
+    dmr->time = __rdtsc();
+    dmr->function_name = function;

    ++mem->action_idx;

@ -238,13 +240,14 @@ void debug_memory_reserve(uint64 start, uint64 size, int32 type, const char* fun
        mem->reserve_action_idx = 0;
    }

-    mem->reserve_action[mem->reserve_action_idx].type = type;
-    mem->reserve_action[mem->reserve_action_idx].start = start - mem->start;
-    mem->reserve_action[mem->reserve_action_idx].size = size;
+    DebugMemoryRange* dmr = &mem->reserve_action[mem->reserve_action_idx];
+    dmr->type = type;
+    dmr->start = start - mem->start;
+    dmr->size = size;

    // We are using rdtsc since it is faster -> less debugging overhead than using time()
-    mem->reserve_action[mem->reserve_action_idx].time = __rdtsc();
-    mem->reserve_action[mem->reserve_action_idx].function_name = function;
+    dmr->time = __rdtsc();
+    dmr->function_name = function;

    ++mem->reserve_action_idx;
 }
@ -273,29 +276,30 @@ byte* log_get_memory(uint64 size, byte aligned = 1, bool zeroed = false)
        return 0;
    }

-    ASSERT_SIMPLE(size <= debug_container->log_memory.size);
+    LogMemory* log_mem = &debug_container->log_memory;
+    ASSERT_SIMPLE(size <= log_mem->size);

    if (aligned > 1) {
-        uintptr_t address = (uintptr_t) debug_container->log_memory.memory;
-        debug_container->log_memory.pos += (aligned - ((address + debug_container->log_memory.pos) & (aligned - 1))) % aligned;
+        uintptr_t address = (uintptr_t) log_mem->memory;
+        log_mem->pos += (aligned - ((address + log_mem->pos) & (aligned - 1))) % aligned;
    }

    size = ROUND_TO_NEAREST(size, aligned);
-    if (debug_container->log_memory.pos + size > debug_container->log_memory.size) {
-        debug_container->log_memory.pos = 0;
+    if (log_mem->pos + size > log_mem->size) {
+        log_mem->pos = 0;

        if (aligned > 1) {
-            uintptr_t address = (uintptr_t) debug_container->log_memory.memory;
-            debug_container->log_memory.pos += (aligned - ((address + debug_container->log_memory.pos) & (aligned - 1))) % aligned;
+            uintptr_t address = (uintptr_t) log_mem->memory;
+            log_mem->pos += (aligned - ((address + log_mem->pos) & (aligned - 1))) % aligned;
        }
    }

-    byte* offset = (byte *) (debug_container->log_memory.memory + debug_container->log_memory.pos);
+    byte* offset = (byte *) (log_mem->memory + log_mem->pos);
    if (zeroed) {
        memset((void *) offset, 0, size);
    }

-    debug_container->log_memory.pos += size;
+    log_mem->pos += size;

    return offset;
 }
--- a/math/matrix/MatrixFloat32.h
+++ b/math/matrix/MatrixFloat32.h
@ -23,6 +23,7 @@

 // @todo Implement intrinsic versions!

+inline
 void vec2_normalize(f32* __restrict x, f32* __restrict y)
 {
    f32 d = sqrtf((*x) * (*x) + (*y) * (*y));
@ -94,6 +95,7 @@ f32 vec2_dot(const v2_f32* a, const v2_f32* b) {
    return a->x * b->x + a->y * b->y;
 }

+inline
 void vec3_normalize(f32* __restrict x, f32* __restrict y, f32* __restrict z)
 {
    f32 d = sqrtf((*x) * (*x) + (*y) * (*y) + (*z) * (*z));
@ -103,6 +105,7 @@ void vec3_normalize(f32* __restrict x, f32* __restrict y, f32* __restrict z)
    *z /= d;
 }

+inline
 void vec3_normalize(v3_f32* vec)
 {
    f32 d = sqrtf(vec->x * vec->x + vec->y * vec->y + vec->z * vec->z);
@ -179,6 +182,7 @@ void vec3_cross(v3_f32* __restrict vec, const v3_f32* a, const v3_f32* b) {
    vec->z = a->x * b->y - a->y * b->x;
 }

+inline
 f32 vec3_dot(const v3_f32* a, const v3_f32* b) {
    return a->x * b->x + a->y * b->y + a->z * b->z;
 }