diff --git a/camera/Camera.h b/camera/Camera.h
index 39eb3f8..8b0fe53 100644
--- a/camera/Camera.h
+++ b/camera/Camera.h
@@ -16,110 +16,428 @@
 
 #include "CameraMovement.h"
 
+#define CAMERA_MAX_INPUTS 4
+
+// @todo Please check out if we can switch to quaternions. We tried but failed.
+// The functions with a 2 at the end are our current backup solution which shouldn't be used (probably)
+
 struct Camera {
-    // left handed cartesian coordinates
     v3_f32 location;
     v4_f32 orientation;
 
+    v3_f32 front;
+    v3_f32 right;
+    v3_f32 up;
+    v3_f32 world_up;
+
     float speed;
     float sensitivity;
     float zoom;
+
+    float fov;
+    float znear;
+    float zfar;
+    float aspect;
 };
 
-void camera_look_at(Camera* camera, const v3_f32* at)
+void
+camera_update_vectors2(Camera* camera)
 {
+    camera->front.x = cosf(OMS_DEG2RAD(camera->orientation.x)) * cosf(OMS_DEG2RAD(camera->orientation.y));
+    camera->front.y = sinf(OMS_DEG2RAD(camera->orientation.x));
+    camera->front.z = cosf(OMS_DEG2RAD(camera->orientation.x)) * sinf(OMS_DEG2RAD(camera->orientation.y));
+    vec3_normalize_f32(&camera->front);
 
+    vec3_cross(&camera->right, &camera->front, &camera->world_up); // @bug
+    vec3_normalize_f32(&camera->right);
+
+    vec3_cross(&camera->up, &camera->right, &camera->front);
+    vec3_normalize_f32(&camera->up);
 }
 
-// you can have up to 4 camera movement inputs at the same time
-void camera_movement(Camera* camera, CameraMovement* movement, float dt)
+void
+camera_update_vectors(Camera* camera)
 {
-    f32 velocity = camera->speed * dt;
+    v3_f32 z = {0.0f, 0.0f, -1.0f};
+    quaternion_rotate_vector(&camera->front, &camera->orientation, &z);
+    vec3_normalize_f32(&camera->front);
 
-    bool has_pos = false;
-    v4_f32 pos = {};
+    vec3_cross(&camera->right, &camera->front, &camera->world_up);
+    vec3_normalize_f32(&camera->right);
 
-    bool has_view = false;
-    v3_f32 view = {};
-    v4_f32 quaternion = {};
+    vec3_cross(&camera->up, &camera->right, &camera->front);
+    vec3_normalize_f32(&camera->up);
+}
 
-    for (int i = 0; i < 4; i++) {
-        switch(movement[i]) {
-            case CAMERA_MOVEMENT_FORWARD: {
-                    pos.z = velocity;
-                    has_pos = true;
-                } break;
-            case CAMERA_MOVEMENT_BACK: {
-                    pos.z = velocity;
-                    has_pos = true;
-                } break;
-            case CAMERA_MOVEMENT_LEFT: {
-                    pos.x = velocity;
-                    has_pos = true;
-                } break;
-            case CAMERA_MOVEMENT_RIGHT: {
-                    pos.x = velocity;
-                    has_pos = true;
-                } break;
-            case CAMERA_MOVEMENT_UP: {
-                    pos.y = velocity;
-                    has_pos = true;
-                } break;
-            case CAMERA_MOVEMENT_DOWN: {
-                    pos.y = velocity;
-                    has_pos = true;
-                } break;
-            case CAMERA_MOVEMENT_PITCH_UP: {
-                    view.pitch += velocity;
-                    has_view = true;
-                } break;
-            case CAMERA_MOVEMENT_PITCH_DOWN: {
-                    view.pitch -= velocity;
-                    has_view = true;
-                } break;
-            case CAMERA_MOVEMENT_ROLL_LEFT: {
-                    view.roll += velocity;
-                    has_view = true;
-                } break;
-            case CAMERA_MOVEMENT_ROLL_RIGHT: {
-                    view.roll -= velocity;
-                    has_view = true;
-                } break;
-            case CAMERA_MOVEMENT_YAW_LEFT: {
-                    view.yaw += velocity;
-                    has_view = true;
-                } break;
-            case CAMERA_MOVEMENT_YAW_RIGHT: {
-                    view.yaw -= velocity;
-                    has_view = true;
-                } break;
-            case CAMERA_MOVEMENT_ZOOM_IN: {
-                    camera->zoom += velocity;
-                } break;
-            case CAMERA_MOVEMENT_ZOOM_OUT: {
-                    camera->zoom -= velocity;
-                } break;
-            default: {}
+void camera_rotate2(Camera* camera, float dx, float dy, float dt) // Euler-angle based rotation; dt is currently unused
+{
+    f32 velocity = camera->sensitivity; // @todo do we need dt?
+    // Scale the raw input deltas by the camera sensitivity
+    dx *= velocity;
+    dy *= velocity;
+    // orientation.x is used as pitch and orientation.y as yaw (both in degrees, see camera_update_vectors2)
+    camera->orientation.x += dy;
+    camera->orientation.y += dx;
+
+    if (true) {
+        if (camera->orientation.x > 89.0f) { // clamp pitch to [-89, 89] degrees
+            camera->orientation.x = 89.0f;
+        } else if (camera->orientation.x < -89.0f) {
+            camera->orientation.x = -89.0f;
+        }
+        // Wrap yaw back toward zero; subtracting 360 unconditionally pushed negative yaw further out of range
+        if (camera->orientation.y > 360.0f || camera->orientation.y < -360.0f) {
+            camera->orientation.y -= camera->orientation.y > 0.0f ? 360.0f : -360.0f;
         }
     }
 
-    // A position change updates the position AND the quaternion
-    if (has_pos) {
-        // @question this might be wrong/bad since pos is not a normalized vector
-        v4_f32 quat_temp = camera->orientation;
-        quaternion_rotate_euler(&camera->orientation, &quat_temp, &pos);
+    camera_update_vectors2(camera); // rebuild front/right/up from the updated angles
+}
 
-        camera->location.x += pos.x;
-        camera->location.y += pos.y;
-        camera->location.z += pos.z;
+void camera_rotate(Camera* camera, float dx, float dy, float dt)
+{
+    f32 velocity = camera->sensitivity; // @todo do we need dt?
+
+    dx *= velocity;
+    dy *= velocity;
+
+    v4_f32 yaw_quat;
+    quaternion_from_axis_angle(&yaw_quat, &camera->world_up, dx);
+
+    v4_f32 pitch_quat;
+    quaternion_from_axis_angle(&pitch_quat, &camera->right, dy);
+
+    v4_f32 result;
+    quaternion_multiply(&result, &camera->orientation, &pitch_quat);
+    quaternion_multiply(&camera->orientation, &yaw_quat, &result);
+    quaternion_unit(&camera->orientation);
+
+    // constrain pitch
+    if (true) {
+        v3_f32 euler;
+        quaternion_to_euler(&camera->orientation, &euler);
+
+        bool found_constrain = false;
+
+        float pitch = euler.x;
+        if (pitch > 89.0f) {
+            pitch = 89.0f;
+            found_constrain = true;
+        } else if (pitch < -89.0f) {
+            pitch = -89.0f;
+            found_constrain = true;
+        }
+
+        if (found_constrain) {
+            v4_f32 constrained;
+            quaternion_from_axis_angle(&constrained, &camera->right, pitch);
+            quaternion_multiply(&camera->orientation, &yaw_quat, &constrained);
+            quaternion_unit(&camera->orientation);
+        }
     }
 
-    // A view change only updates the quaternion
-    if (has_view) {
-        v4_f32 quat_temp = camera->orientation;
-        quaternion_from_euler(&quaternion, &view);
-        quaternion_multiply(&camera->orientation, &quat_temp, &quaternion);
+    camera_update_vectors(camera);
+}
+
+// Applies up to CAMERA_MAX_INPUTS movement inputs; relative_to_world selects world axes (true) or the camera's own axes (false)
+void camera_movement(Camera* camera, CameraMovement* movement, float dt, bool relative_to_world = true)
+{
+    f32 velocity = camera->speed * dt; // step size shared by translation, rotation and zoom inputs
+
+    if (relative_to_world) {
+        for (int i = 0; i < CAMERA_MAX_INPUTS; i++) {
+            switch(movement[i]) {
+                case CAMERA_MOVEMENT_FORWARD: {
+                        camera->location.z += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_BACK: {
+                        camera->location.z -= velocity;
+                    } break;
+                case CAMERA_MOVEMENT_LEFT: {
+                        camera->location.x -= velocity;
+                    } break;
+                case CAMERA_MOVEMENT_RIGHT: {
+                        camera->location.x += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_UP: {
+                        camera->location.y += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_DOWN: {
+                        camera->location.y -= velocity;
+                    } break;
+                case CAMERA_MOVEMENT_PITCH_UP: {
+                        camera->orientation.x += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_PITCH_DOWN: {
+                        camera->orientation.x -= velocity;
+                    } break;
+                case CAMERA_MOVEMENT_ROLL_LEFT: {
+                        camera->orientation.z += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_ROLL_RIGHT: {
+                        camera->orientation.z -= velocity;
+                    } break;
+                case CAMERA_MOVEMENT_YAW_LEFT: {
+                        camera->orientation.y += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_YAW_RIGHT: {
+                        camera->orientation.y -= velocity;
+                    } break;
+                case CAMERA_MOVEMENT_ZOOM_IN: {
+                        camera->zoom += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_ZOOM_OUT: {
+                        camera->zoom -= velocity;
+                    } break;
+                default: {}
+            }
+        }
+    } else {
+        v3_f32 forward = camera->front; // camera-relative basis: forward is the stored front vector
+
+        v3_f32 right;
+        vec3_cross(&right, &forward, &camera->world_up);
+        vec3_normalize_f32(&right);
+
+        v3_f32 up;
+        vec3_cross(&up, &right, &forward);
+        vec3_normalize_f32(&up);
+
+        for (int i = 0; i < CAMERA_MAX_INPUTS; i++) {
+            switch(movement[i]) {
+                case CAMERA_MOVEMENT_FORWARD: {
+                        camera->location.x += forward.x * velocity;
+                        camera->location.y += forward.y * velocity;
+                        camera->location.z += forward.z * velocity;
+                    } break;
+                case CAMERA_MOVEMENT_BACK: {
+                        camera->location.x -= forward.x * velocity;
+                        camera->location.y -= forward.y * velocity;
+                        camera->location.z -= forward.z * velocity;
+                    } break;
+                case CAMERA_MOVEMENT_LEFT: {
+                        camera->location.x -= right.x * velocity;
+                        camera->location.y -= right.y * velocity;
+                        camera->location.z -= right.z * velocity;
+                    } break;
+                case CAMERA_MOVEMENT_RIGHT: {
+                        camera->location.x += right.x * velocity;
+                        camera->location.y += right.y * velocity;
+                        camera->location.z += right.z * velocity;
+                    } break;
+                case CAMERA_MOVEMENT_UP: {
+                        camera->location.x += up.x * velocity;
+                        camera->location.y += up.y * velocity;
+                        camera->location.z += up.z * velocity;
+                    } break;
+                case CAMERA_MOVEMENT_DOWN: {
+                        camera->location.x -= up.x * velocity;
+                        camera->location.y -= up.y * velocity;
+                        camera->location.z -= up.z * velocity;
+                    } break;
+                case CAMERA_MOVEMENT_PITCH_UP: {
+                        camera->orientation.x += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_PITCH_DOWN: {
+                        camera->orientation.x -= velocity;
+                    } break;
+                case CAMERA_MOVEMENT_ROLL_LEFT: {
+                        camera->orientation.z += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_ROLL_RIGHT: {
+                        camera->orientation.z -= velocity;
+                    } break;
+                case CAMERA_MOVEMENT_YAW_LEFT: {
+                        camera->orientation.y += velocity; // yaw lives in .y (as in the world-relative branch); .z is roll
+                    } break;
+                case CAMERA_MOVEMENT_YAW_RIGHT: {
+                        camera->orientation.y -= velocity; // yaw lives in .y (as in the world-relative branch); .z is roll
+                    } break;
+                case CAMERA_MOVEMENT_ZOOM_IN: {
+                        camera->zoom += velocity;
+                    } break;
+                case CAMERA_MOVEMENT_ZOOM_OUT: {
+                        camera->zoom -= velocity;
+                    } break;
+                default: {}
+            }
+        }
     }
 }
 
+inline
+void camera_projection_matrix_lh(const Camera* __restrict camera, float* __restrict projection)
+{
+    mat4_identity_sparse(projection);
+    mat4_perspective_sparse_lh(
+        projection,
+        camera->fov,
+        camera->aspect,
+        camera->znear,
+        camera->zfar
+    );
+}
+
+inline
+void camera_projection_matrix_rh(const Camera* __restrict camera, float* __restrict projection)
+{
+    mat4_identity_sparse(projection);
+    mat4_perspective_sparse_rh(
+        projection,
+        camera->fov,
+        camera->aspect,
+        camera->znear,
+        camera->zfar
+    );
+}
+
+// This is usually not used, since it is included in the view matrix
+// expects the identity matrix
+inline
+void camera_translation_matrix_sparse(const Camera* __restrict camera, float* translation)
+{
+    translation[3] = camera->location.x;
+    translation[7] = camera->location.y;
+    translation[11] = camera->location.z;
+}
+
+// @performance this function seems worthwhile to fully convert to simd
+//  even if we are not really looping over anything we do have some repetitive operations (rotate, dot)
+/*
+void
+camera_view_matrix_sparse(const Camera* __restrict camera, float* __restrict view)
+{
+    // @performance orientation gets converted to a quat every time, pull this out
+
+    v3_f32 up = {0.0f, 1.0f, 0.0f};
+    quaternion_rotate_active(&up, camera->orientation.pitch, camera->orientation.yaw, camera->orientation.roll);
+
+    v3_f32 right = {1.0f, 0.0f, 0.0f};
+    quaternion_rotate_active(&up, camera->orientation.pitch, camera->orientation.yaw, camera->orientation.roll);
+
+    v3_f32 forward = {0.0f, 0.0f, 1.0f};
+    quaternion_rotate_active(&forward, camera->orientation.pitch, camera->orientation.yaw, camera->orientation.roll);
+
+    view[0] = right.x;
+    view[1] = right.y;
+    view[2] = right.z;
+
+    view[4] = up.x;
+    view[5] = up.y;
+    view[6] = up.z;
+
+    view[8] = -forward.x;
+    view[9] = -forward.y;
+    view[10] = -forward.z;
+
+    // Set the translation part
+    v3_f32 right_v3 = {right.x, right.y, right.z};
+    view[3] = -v3_dot(&right_v3, &camera->location);
+
+    v3_f32 up_v3 = {up.x, up.y, up.z};
+    view[7] = -v3_dot(&up_v3, &camera->location);
+
+    v3_f32 forward_v3 = {forward.x, forward.y, forward.z};
+    view[11] = v3_dot(&forward_v3, &camera->location);
+
+    // Last element of matrix (homogeneous coordinate)
+    view[15] = 1.0f;
+}
+*/
+
+// https://github.com/g-truc/glm/blob/33b4a621a697a305bc3a7610d290677b96beb181/glm/ext/matrix_transform.inl
+// https://learnopengl.com/code_viewer_gh.php?code=includes/learnopengl/camera.h
+void
+camera_view_matrix_sparse_lh(const Camera* __restrict camera, float* __restrict view)
+{
+    // We are skipping some things because some things either get neutralized
+    //  (e.g. position - (position + front), other values are already normalized (e.g. front)
+    v3_f32 f = { camera->front.x, camera->front.y, camera->front.z };
+
+    v3_f32 s;
+    vec3_cross(&s, &camera->up, &f);
+    vec3_normalize_f32(&s);
+
+    v3_f32 u;
+    vec3_cross(&u, &f, &s);
+
+    view[0] = s.x;
+    view[1] = s.y;
+    view[2] = s.z;
+    view[3] = 0.0f;
+    view[4] = u.x;
+    view[5] = u.y;
+    view[6] = u.z;
+    view[7] = 0.0f;
+    view[8] = f.x;
+    view[9] = f.y;
+    view[10] = f.z;
+    view[11] = 0;
+    view[12] = -vec3_dot(&s, &camera->location);
+    view[13] = -vec3_dot(&u, &camera->location);
+    view[14] = -vec3_dot(&f, &camera->location);
+    view[15] = 1.0f;
+}
+
+void
+camera_view_matrix_sparse_rh(const Camera* __restrict camera, float* __restrict view)
+{
+    // We are skipping some things because some things either get neutralized
+    //  (e.g. position - (position + front), other values are already normalized (e.g. front)
+    v3_f32 f = { -camera->front.x, -camera->front.y, -camera->front.z };
+
+    v3_f32 s;
+    vec3_cross(&s, &f, &camera->up);
+    vec3_normalize_f32(&s);
+
+    v3_f32 u;
+    vec3_cross(&u, &s, &f);
+
+    view[0] = s.x;
+    view[1] = s.y;
+    view[2] = s.z;
+    view[3] = 0.0f;
+    view[4] = u.x;
+    view[5] = u.y;
+    view[6] = u.z;
+    view[7] = 0.0f;
+    view[8] = f.x;
+    view[9] = f.y;
+    view[10] = f.z;
+    view[11] = 0;
+    view[12] = -vec3_dot(&s, &camera->location);
+    view[13] = -vec3_dot(&u, &camera->location);
+    view[14] = vec3_dot(&f, &camera->location);
+    view[15] = 1.0f;
+}
+
+void
+camera_view_right_handed2(float* view) // rearranges a 16-float view matrix in place; statement order matters
+{
+    // Translation part: copy elements 3, 7, 11 into 12, 13, 14 before they are zeroed below
+    view[12] = view[3];
+    view[13] = view[7];
+    view[14] = view[11];
+    view[15] = 1.0f; // @todo could be removed
+    // Swap elements 1 and 4 (no sign change)
+    float temp;
+    temp = view[1];
+    view[1] = view[4];
+    view[4] = temp;
+    // Swap elements 2 and 8; the value moved into 8 is negated
+    temp = view[2];
+    view[2] = view[8];
+    view[8] = -temp;
+
+    view[3] = 0.0f; // @todo could be removed
+    // Swap elements 6 and 9; the value moved into 9 is negated
+    temp = view[6];
+    view[6] = view[9];
+    view[9] = -temp;
+
+    view[7] = 0.0f; // @todo could be removed
+    view[10] = -view[10]; // flip the sign of element 10
+    view[11] = 0.0f; // @todo could be removed
+}
+
 #endif
\ No newline at end of file
diff --git a/camera/CameraMovement.h b/camera/CameraMovement.h
index 6798af1..65885e1 100644
--- a/camera/CameraMovement.h
+++ b/camera/CameraMovement.h
@@ -10,6 +10,8 @@
 #define TOS_CAMERA_MOVEMENT_H
 
 enum CameraMovement {
+    CAMERA_MOVEMENT_NONE,
+
     CAMERA_MOVEMENT_FORWARD,
     CAMERA_MOVEMENT_BACK,
 
@@ -19,6 +21,8 @@ enum CameraMovement {
     CAMERA_MOVEMENT_UP,
     CAMERA_MOVEMENT_DOWN,
 
+    CAMERA_MOVEMENT_FREE_ORIENTATION,
+
     CAMERA_MOVEMENT_PITCH_UP,
     CAMERA_MOVEMENT_PITCH_DOWN,
 
diff --git a/gpuapi/opengl/Opengl.h b/gpuapi/opengl/Opengl.h
index 2691ff4..d798c5c 100644
--- a/gpuapi/opengl/Opengl.h
+++ b/gpuapi/opengl/Opengl.h
@@ -727,6 +727,9 @@ typedef char GLchar;
 typedef ptrdiff_t GLsizeiptr;
 typedef ptrdiff_t GLintptr;
 
+// Some apis require a different sign for various operations (left/right)
+#define GPU_API_SIGN -1
+
 #if _WIN32
     #include "OpenglWin32.h"
 #else
diff --git a/gpuapi/opengl/OpenglUtils.h b/gpuapi/opengl/OpenglUtils.h
index 00ec9fd..ca2d0d7 100644
--- a/gpuapi/opengl/OpenglUtils.h
+++ b/gpuapi/opengl/OpenglUtils.h
@@ -14,6 +14,7 @@
 #include "../../utils/TestUtils.h"
 #include "../../models/Attrib.h"
 #include "../../object/Texture.h"
+#include "../../utils/StringUtils.h"
 
 #include "../RenderUtils.h"
 #include "Opengl.h"
@@ -24,64 +25,53 @@
     #include "../../platform/win32/Window.h"
 #endif
 
-/*
-struct Window {
-    bool is_fullscreen;
-    int32 width;
-    int32 height;
-    char name[32];
+inline
+void change_viewport(Window* w, int offset_x = 0, int offset_y = 0)
+{
+    glViewport(offset_x, offset_y, w->width, w->height);
+}
 
-    int32 x;
-    int32 y;
+inline
+void vsync_set(bool on) // sets the swap interval to 1 (on) or 0 (off) when the extension function is available
+{
+    if (wglSwapIntervalEXT) { wglSwapIntervalEXT((int) on); } // pointer comes from wglGetProcAddress and may be NULL
+}
 
-    GLFWwindow* hwnd_lib;
+inline
+void wireframe_mode(bool on)
+{
+    // Select the polygon rasterization mode for front and back faces:
+    //  on  -> GL_LINE
+    //  off -> GL_FILL
+    // Both faces are always switched together.
+    glPolygonMode(GL_FRONT_AND_BACK, on ? GL_LINE : GL_FILL);
+}
 
-    #ifdef _WIN32
-        HWND hwnd;
-    #endif
+struct OpenglInfo {
+    char* renderer;
+    int major;
+    int minor;
 };
-*/
 
-/*
-inline
-void window_create(Window* window, void*)
+void opengl_info(OpenglInfo* info) // fills renderer name and major/minor version parsed from GL_VERSION
 {
-    //GLFWmonitor *monitor = glfwGetPrimaryMonitor();
-    window->hwnd_lib = glfwCreateWindow(
-        window->width,
-        window->height,
-        window->name,
-        NULL,
-        NULL
-    );
+    info->renderer = (char *) glGetString(GL_RENDERER);
+    info->major = 1; // defaults (1.0) are kept when no '.' is found in the version string
+    info->minor = 0;
 
-    ASSERT_SIMPLE(window->hwnd_lib);
+    char* version = (char *) glGetString(GL_VERSION);
 
-    //glfwSetInputMode(window->hwnd_lib, GLFW_CURSOR, GLFW_CURSOR_DISABLED);
+    for (char *at = version; at && *at; ++at) { // glGetString returns NULL on error, so guard before dereferencing
+        if (*at == '.') {
+            info->major = str_to_int(version); // digits before the first '.'
 
-    glfwMakeContextCurrent(window->hwnd_lib);
-    glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE);
-
-    #if GLFW_EXPOSE_NATIVE_WIN32
-        window->hwnd = glfwGetWin32Window(window->hwnd_lib);
-    #endif
+            ++at;
+            info->minor = str_to_int(at); // digits right after the first '.'
+            break;
+        }
+    }
 }
 
-inline
-void window_open(Window* window)
-{
-    glfwMakeContextCurrent(window->hwnd_lib);
-    glViewport(window->x, window->y, window->width, window->height);
-    glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE);
-}
-
-inline
-void window_close(Window* window)
-{
-    glfwWindowShouldClose(window->hwnd_lib);
-}
-*/
-
 inline
 uint32 get_texture_data_type(uint32 texture_data_type)
 {
diff --git a/gpuapi/opengl/OpenglWin32.h b/gpuapi/opengl/OpenglWin32.h
index edb2883..e71730f 100644
--- a/gpuapi/opengl/OpenglWin32.h
+++ b/gpuapi/opengl/OpenglWin32.h
@@ -1184,8 +1184,10 @@ typedef HGLRC WINAPI wgl_create_context_attribs_arb(HDC hDC, HGLRC hShareContext
 typedef BOOL WINAPI wgl_get_pixel_format_attrib_iv_arb(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, const int *piAttributes, int *piValues);
 typedef BOOL WINAPI wgl_get_pixel_format_attrib_fv_arb(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, const int *piAttributes, FLOAT *pfValues);
 typedef BOOL WINAPI wgl_choose_pixel_format_arb(HDC hdc, const int *piAttribIList, const FLOAT *pfAttribFList, UINT nMaxFormats, int *piFormats, UINT *nNumFormats);
+typedef BOOL WINAPI wgl_swap_interval_ext(int interval);
 typedef const char * WINAPI wgl_get_extensions_string_ext(void);
 
+// @question consider to make all these functions global
 struct OpenGL {
     type_glTexImage2DMultisample* glTexImage2DMultisample;
     type_glBindFramebuffer* glBindFramebuffer;
@@ -1249,9 +1251,12 @@ struct OpenGL {
 
     wgl_choose_pixel_format_arb* wglChoosePixelFormatARB;
     wgl_create_context_attribs_arb* wglCreateContextAttribsARB;
+    wgl_swap_interval_ext* wglSwapIntervalEXT;
     wgl_get_extensions_string_ext* wglGetExtensionsStringEXT;
 };
 
+static wgl_swap_interval_ext* wglSwapIntervalEXT;
+
 void set_pixel_format(HDC hdc, OpenGL* gl)
 {
     int suggested_pixel_format_idx = 0;
@@ -1368,6 +1373,8 @@ void opengl_init(Window* window, OpenGL* gl)
 
     gl->wglChoosePixelFormatARB = (wgl_choose_pixel_format_arb *) wglGetProcAddress("wglChoosePixelFormatARB");
     gl->wglCreateContextAttribsARB = (wgl_create_context_attribs_arb *) wglGetProcAddress("wglCreateContextAttribsARB");
+    gl->wglSwapIntervalEXT = (wgl_swap_interval_ext *) wglGetProcAddress("wglSwapIntervalEXT");
+    wglSwapIntervalEXT = gl->wglSwapIntervalEXT;
     gl->wglGetExtensionsStringEXT = (wgl_get_extensions_string_ext *) wglGetProcAddress("wglGetExtensionsStringEXT");
 
     set_pixel_format(window->hdc, gl);
@@ -1445,6 +1452,10 @@ void opengl_init(Window* window, OpenGL* gl)
     gl->glDrawArraysInstanced = (type_glDrawArraysInstanced *) wglGetProcAddress("glDrawArraysInstanced");
     gl->glDrawElementsInstanced = (type_glDrawElementsInstanced *) wglGetProcAddress("glDrawElementsInstanced");
 
+    if (gl->wglSwapIntervalEXT) {
+        gl->wglSwapIntervalEXT(0);
+    }
+
     // @todo now do: OpenGLInit
 }
 
diff --git a/input/Input.h b/input/Input.h
index d41b025..1c34c40 100644
--- a/input/Input.h
+++ b/input/Input.h
@@ -9,25 +9,88 @@
 #ifndef TOS_INPUT_H
 #define TOS_INPUT_H
 
+// @question Consider to change mouse to secondary input device and keyboard to primary input device and also rename the functions etc.
+
+// How many concurrent mouse/secondary input device presses do we recognize?
+#define MAX_MOUSE_PRESSES 3
+
+// How many concurrent primary key/button presses can be handled?
 #define MAX_KEY_PRESSES 5
+
+// How many keys/buttons do we support for the primary input device
+#define MAX_KEYBOARD_KEYS 255
+
+// How many mouse/secondary input device keys/buttons do we support
+#define MAX_MOUSE_KEYS 5
+
 #define MIN_INPUT_DEVICES 2
 
+// How often can a key be assigned to a different hotkey
+#define MAX_KEY_TO_HOTKEY 5
+
+// How many buttons together are allowed to form a hotkey
+#define MAX_HOTKEY_COMBINATION 3
+
+// These values are used as bit flags to hint if a "key" is a keyboard/primary or mouse/secondary input
+// When adding a keybind the "key" can only be uint8 but we expand it to an int and set the first (highest) bit accordingly
+#define INPUT_KEYBOARD_PREFIX 0x80000000
+#define INPUT_MOUSE_PREFIX 0
+
 #define INPUT_TYPE_MOUSE_KEYBOARD 0x01
 #define INPUT_TYPE_OTHER 0x03
 
 #define MIN_CONTROLLER_DEVICES 4
 
 #include "../stdlib/Types.h"
+#include "../utils/BitUtils.h"
 
 #ifdef _WIN32
     #include <windows.h>
 #endif
 
+// @todo I'm not sure if I like the general input handling
+//      Having separate keyboard_down and mouse_down etc. is a little bit weird in the functions below
+
+struct InputMapping {
+    // A key/button can be bound to up to 5 different hotkeys
+    // This is used to check if a key/button has a hotkey association
+    uint8 keys[MAX_KEYBOARD_KEYS + MAX_MOUSE_KEYS][MAX_KEY_TO_HOTKEY];
+
+    // A hotkey can be bound to a combination of up to 3 key/button presses
+    uint8 hotkey_count;
+    uint8* hotkeys;
+};
+
+// @question Maybe we should also add a third key_down array for controllers and some special controller functions here to just handle everything in one struct
+//      Or think about completely splitting all states (mouse, keyboard, other)
 struct InputState {
+    // State of the hotkeys, resulting from the device input
+    // @question maybe create a separate define and make it a little bit larger?
+    uint8 state_hotkeys[MAX_KEY_PRESSES];
+
+    uint8 keys_down[MAX_KEY_PRESSES];
+
+    // @question Why do we even need this? shouldn't we only care about the current keys down?
+    uint8 keys_up[MAX_KEY_PRESSES];
+
+    uint32 mouse_down;
+
+    int32 dx;
+    int32 dy;
+
+    uint32 x;
+    uint32 y;
+
+    int16 wheel_delta = 0;
+    int16 hwheel_delta = 0;
+
+    uint64 keys_down_time[MAX_MOUSE_PRESSES + MAX_KEY_PRESSES];
+};
+
+struct Input {
     // Device
     bool is_connected = false;
     byte type = INPUT_TYPE_OTHER;
-    double time;
 
     #ifdef _WIN32
         // @todo maybe replace with id?!
@@ -36,46 +99,20 @@ struct InputState {
         HANDLE handle_mouse;
     #endif
 
-    // After handling the keyboard state change the game loop should set this to false
-    bool state_change_keyboard = false;
-
-    // We only consider up to 4 pressed keys
-    // Depending on the keyboard you may only be able to detect a limited amount of key presses anyway
-    int up_index;
-    uint8 keys_down_old[MAX_KEY_PRESSES];
-
-    int down_index;
-    uint8 keys_down[MAX_KEY_PRESSES];
-
-    // Mouse
-    // After handling the mouse state change the game loop should set this to false
+    bool state_change_button = false;
     bool state_change_mouse = false;
+    bool state_change_mouse_button = true;
 
-    uint32 x;
-    uint32 y;
+    bool mouse_movement;
 
-    uint32 x_last;
-    uint32 y_last;
+    InputState state;
+    InputMapping input_mapping;
 
-    // https://usb.org/sites/default/files/hid1_11.pdf Page 71 or 61 = 18
-    // the bitfield represents which button is pressed
-    uint32 mouse_down_old;
-    uint32 mouse_down;
-
-    int16 wheel_delta = 0;
-    uint32 raw_button = 0;
+    // @todo we probably don't need this
+    InputState state_old;
 };
 
-void input_transition(InputState* state)
-{
-    // Mouse
-    state->x_last = state->x;
-    state->y_last = state->y;
-
-    state->state_change_mouse = false;
-}
-
-struct ControllerState {
+struct ControllerInput {
     uint32 id = 0;
     bool is_connected = false;
 
@@ -104,4 +141,263 @@ struct ControllerState {
     bool stickr_press = false;
 };
 
+// Snapshots the mouse portion of input->state (buttons, position, both wheel deltas) into input->state_old.
+inline
+void mouse_backup_state(Input* input)
+{
+    input->state_old.mouse_down = input->state.mouse_down;
+    input->state_old.x = input->state.x;
+    input->state_old.y = input->state.y;
+    input->state_old.wheel_delta = input->state.wheel_delta;
+    // Fix: the horizontal wheel used to be backed up from the vertical wheel_delta
+    input->state_old.hwheel_delta = input->state.hwheel_delta;
+}
+
+// Zeroes the pressed keys, the released keys and the press timestamps stored in state.
+inline void keyboard_clean_state(InputState* state)
+{
+    memset(state->keys_down_time, 0, (MAX_MOUSE_PRESSES + MAX_KEY_PRESSES) * sizeof(uint64));
+    memset(state->keys_up, 0, MAX_KEY_PRESSES * sizeof(uint8));
+    memset(state->keys_down, 0, MAX_KEY_PRESSES * sizeof(uint8));
+}
+
+// Copies the pressed and the released key lists of input->state into input->state_old.
+inline void keyboard_backup_state(Input* input)
+{
+    memcpy(input->state_old.keys_up, input->state.keys_up, MAX_KEY_PRESSES * sizeof(uint8));
+    memcpy(input->state_old.keys_down, input->state.keys_down, MAX_KEY_PRESSES * sizeof(uint8));
+}
+
+inline
+bool keyboard_is_pressed(const InputState* state, byte key)
+{
+    // True as soon as one of the first five keys_down slots holds key
+    for (int slot = 0; slot < 5; ++slot) {
+        if (state->keys_down[slot] == key) { return true; }
+    }
+    return false;
+}
+
+inline
+bool keyboard_is_released(const InputState* state, byte key)
+{
+    // True as soon as one of the first five keys_up slots holds key
+    for (int slot = 0; slot < 5; ++slot) {
+        if (state->keys_up[slot] == key) { return true; }
+    }
+    return false;
+}
+
+inline bool keyboard_are_pressed(
+    const InputState* state,
+    byte key0, byte key1 = 0, byte key2 = 0, byte key3 = 0, byte key4 = 0
+) {
+    // key0 is mandatory; each further key only has to be down when it is non-zero
+    if (key0 == 0 || !keyboard_is_pressed(state, key0)) { return false; }
+    if (key1 != 0 && !keyboard_is_pressed(state, key1)) { return false; }
+    if (key2 != 0 && !keyboard_is_pressed(state, key2)) { return false; }
+    if (key3 != 0 && !keyboard_is_pressed(state, key3)) { return false; }
+    return key4 == 0 || keyboard_is_pressed(state, key4);
+}
+
+// Registers `hotkey` (ids start at 1) for up to three keys. The binding is stored bi-directionally: hotkey -> keys and key -> hotkeys
+void
+input_add_hotkey(
+    InputMapping* mapping, uint8 hotkey,
+    uint32 key0, uint32 key1 = 0, uint32 key2 = 0
+)
+{
+    int count = 0;
+    // Keys flagged with INPUT_KEYBOARD_PREFIX are shifted by MAX_MOUSE_KEYS, keys without the flag get offset 0
+    int key0_offset = ((bool) (key0 & INPUT_KEYBOARD_PREFIX)) * MAX_MOUSE_KEYS;
+    int key1_offset = ((bool) (key1 & INPUT_KEYBOARD_PREFIX)) * MAX_MOUSE_KEYS;
+    int key2_offset = ((bool) (key2 & INPUT_KEYBOARD_PREFIX)) * MAX_MOUSE_KEYS;
+    // Strip the prefix flag so that only the plain key id remains
+    key0 = key0 & ~INPUT_KEYBOARD_PREFIX;
+    key1 = key1 & ~INPUT_KEYBOARD_PREFIX;
+    key2 = key2 & ~INPUT_KEYBOARD_PREFIX;
+
+    // Store the keys that make up this hotkey; `count` is the next free slot of the combination
+    if (key0 != 0) {
+        // Note: -1 since the hotkeys MUST start at 1 (0 is a special value for empty)
+        mapping->hotkeys[(hotkey - 1) * MAX_HOTKEY_COMBINATION] = (uint8) (key0 + key0_offset);
+        ++count;
+    }
+
+    if (key1 != 0) {
+        // Note: -1 since the hotkeys MUST start at 1 (0 is a special value for empty)
+        mapping->hotkeys[(hotkey - 1) * MAX_HOTKEY_COMBINATION + count] = (uint8) (key1 + key1_offset);
+        ++count;
+    }
+
+    if (key2 != 0) {
+        // Note: -1 since the hotkeys MUST start at 1 (0 is a special value for empty)
+        mapping->hotkeys[(hotkey - 1) * MAX_HOTKEY_COMBINATION + count] = (uint8) (key2 + key2_offset);
+    }
+    // Bind every key back to the hotkey: each key takes the first free slot (value 0) of its own hotkey list.
+    // A key whose MAX_KEY_TO_HOTKEY slots are all occupied is not bound; nothing is reported in that case.
+    for (int i = 0; i < MAX_KEY_TO_HOTKEY; ++i) {
+        if (key0 == 0 && key1 == 0 && key2 == 0) {
+            break;
+        }
+
+        if (key0 != 0 && mapping->keys[key0 + key0_offset - 1][i] == 0) {
+            mapping->keys[key0 + key0_offset - 1][i] = hotkey;
+            key0 = 0; // prevent adding same key again
+        }
+
+        if (key1 != 0 && mapping->keys[key1 + key1_offset - 1][i] == 0) {
+            mapping->keys[key1 + key1_offset - 1][i] = hotkey;
+            key1 = 0; // prevent adding same key again
+        }
+
+        if (key2 != 0 && mapping->keys[key2 + key2_offset - 1][i] == 0) {
+            mapping->keys[key2 + key2_offset - 1][i] = hotkey;
+            key2 = 0; // prevent adding same key again
+        }
+    }
+}
+
+inline
+bool hotkey_is_active(const InputState* state, uint8 hotkey)
+{
+    // True as soon as one of the first five state_hotkeys slots holds hotkey
+    for (int slot = 0; slot < 5; ++slot) {
+        if (state->state_hotkeys[slot] == hotkey) { return true; }
+    }
+    return false;
+}
+
+// Similar to hotkey_is_active, but instead of looking up the results cached by input_hotkey_state
+// this function checks the current input state directly (not the hotkey state).
+// @performance This looks like the simpler solution, doesn't it?
+//      However, it is probably the slower solution once this function is called many times.
+//      Remember, we would call this function for almost every possible hotkey (depending on context) per frame.
+inline
+bool hotkey_is_pressed(const InputState* __restrict state, const InputMapping* __restrict mapping, uint8 hotkey)
+{
+    // Returns true when every key of the combination registered for `hotkey` is currently down.
+    //      state:   current input state (keys_down and the mouse_down bitfield are read)
+    //      mapping: hotkey table with MAX_HOTKEY_COMBINATION key slots per hotkey
+    //      hotkey:  hotkey id, valid ids start at 1
+
+    // Hotkey 0 is the "empty" marker of the key -> hotkey lists.
+    // Without this guard (hotkey - 1) becomes -1 and the lookup reads in front of mapping->hotkeys.
+    if (hotkey == 0) {
+        return false;
+    }
+
+    // A combination consists of up to three keys, stored front to back
+    for (int slot = 0; slot < 3; ++slot) {
+        uint8 key = mapping->hotkeys[(hotkey - 1) * MAX_HOTKEY_COMBINATION + slot];
+        if (key == 0) {
+            // Empty first slot: nothing is bound to this hotkey.
+            // Empty later slot: a shorter combination ends here and all of its keys were down.
+            return slot > 0;
+        }
+
+        // Ids above MAX_MOUSE_KEYS are keyboard keys stored with that offset, the rest are 1-based mouse button bits.
+        // Assigning to a bool (instead of the previous `&=`) stays correct even if IS_BIT_SET yields the masked bit.
+        bool is_down;
+        if (key > MAX_MOUSE_KEYS) {
+            is_down = keyboard_is_pressed(state, (byte) (key - MAX_MOUSE_KEYS));
+        } else {
+            is_down = IS_BIT_SET(state->mouse_down, key - 1);
+        }
+
+        if (!is_down) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+void
+input_hotkey_state(InputState* __restrict state, const InputMapping* mapping)
+{
+    // Rebuilds state->state_hotkeys, the list of hotkeys whose complete key combination is currently down
+    // @bug isn't there a bug, MAX_KEY_PRESSES is the keyboard limit, what about additional mouse inputs?
+    memset(state->state_hotkeys, 0, sizeof(uint8) * MAX_KEY_PRESSES);
+    int i = 0;
+
+    // @performance It would be nice if we could skip this loop by checking keyboard_changed similar to the mouse loop further down
+    //      The problem is that this loop checks both mouse and keyboard
+    // Check every key down state
+    for (int down_state = 0; down_state < MAX_KEY_PRESSES; ++down_state) {
+        if (state->keys_down[down_state] == 0) {
+            // no key defined for this down state
+            continue;
+        }
+
+        // Is a key defined for this state AND is at least one hotkey defined for this key
+        //      If no hotkey is defined we don't care
+        //      Careful, remember MAX_MOUSE_KEYS offset
+        const uint8* hotkeys_for_key = mapping->keys[state->keys_down[down_state] + MAX_MOUSE_KEYS - 1];
+        if (hotkeys_for_key[0] == 0) {
+            // no possible hotkey associated with this key
+            continue;
+        }
+
+        // Check every possible hotkey
+        // Since multiple input devices have their own button/key indices we have to do this weird range handling
+        for (int possible_hotkey_idx = 0; possible_hotkey_idx < MAX_KEY_TO_HOTKEY; ++possible_hotkey_idx) {
+            // We only support a limited amount of active hotkeys
+            if (i >= MAX_KEY_PRESSES) {
+                return;
+            }
+            // Fix: unused slots hold 0 and hotkey ids start at 1, so 0 must never be passed to hotkey_is_pressed
+            uint8 hotkey = hotkeys_for_key[possible_hotkey_idx];
+            if (hotkey == 0) {
+                continue;
+            }
+            // store active hotkey, if it is not already active
+            if (hotkey_is_pressed(state, mapping, hotkey) && !hotkey_is_active(state, hotkey)) {
+                state->state_hotkeys[i] = hotkey;
+                ++i;
+            }
+        }
+    }
+
+    // @performance we could also check if the mouse state even changed
+    if (state->mouse_down == 0 || i >= MAX_KEY_PRESSES) {
+        return;
+    }
+
+    // We now also need to check if there are hotkeys for the mouse buttons
+    // Some are already handled in the previous section, but some might not be handled, since they are mouse only
+    // But this also means, that we ONLY have to search for mouse only hotkeys. It's impossible to find NEW matches with keyboard keys.
+    for (int down_state = 0; down_state < MAX_MOUSE_KEYS; ++down_state) {
+        if (!IS_BIT_SET(state->mouse_down, down_state)) {
+            continue;
+        }
+
+        const uint8* hotkeys_for_key = mapping->keys[down_state];
+        if (hotkeys_for_key[0] == 0) {
+            // no possible hotkey associated with this key
+            continue;
+        }
+
+        for (int possible_hotkey_idx = 0; possible_hotkey_idx < MAX_KEY_TO_HOTKEY; ++possible_hotkey_idx) {
+            // We only support a limited amount of active hotkeys
+            if (i >= MAX_KEY_PRESSES) {
+                return;
+            }
+            uint8 hotkey = hotkeys_for_key[possible_hotkey_idx]; // 0 marks an unused slot
+            if (hotkey == 0) {
+                continue;
+            }
+            if (hotkey_is_pressed(state, mapping, hotkey) && !hotkey_is_active(state, hotkey)) {
+                state->state_hotkeys[i] = hotkey;
+                ++i;
+            }
+        }
+    }
+
+    // @bug how to handle long press vs click
+    // @bug how to handle priority? e.g. there might be a hotkey for 1 and one for alt+1
+    //      in this case only the hotkey for alt+1 should be triggered
+    // @bug how to handle other conditions besides buttons pressed together? some hotkeys are only available in certain situations
+}
+
 #endif
\ No newline at end of file
diff --git a/math/matrix/MatrixFloat32.h b/math/matrix/MatrixFloat32.h
index c1d3eeb..75f014c 100644
--- a/math/matrix/MatrixFloat32.h
+++ b/math/matrix/MatrixFloat32.h
@@ -15,6 +15,261 @@
 #include "../../utils/TestUtils.h"
 #include <math.h>
 
+// @todo Implement intrinsic versions!
+
+// Scales (*x, *y) in place to unit length; a zero-length input is left unchanged instead of becoming NaN.
+void vec2_normalize_f32(float* __restrict x, float* __restrict y) {
+    float d = sqrtf((*x) * (*x) + (*y) * (*y));
+    if (d == 0.0f) { return; } // 0 / 0 would turn both components into NaN
+    *x /= d;
+    *y /= d;
+}
+
+// vec = a + b, component-wise
+inline void vec2_add(v2_f32* __restrict vec, const v2_f32* a, const v2_f32* b) {
+    vec->y = a->y + b->y;
+    vec->x = a->x + b->x;
+}
+
+// vec += b, component-wise
+inline void vec2_add(v2_f32* __restrict vec, const v2_f32* b) {
+    vec->y = vec->y + b->y;
+    vec->x = vec->x + b->x;
+}
+
+// vec = a - b, component-wise
+inline void vec2_sub(v2_f32* __restrict vec, const v2_f32* a, const v2_f32* b) {
+    vec->y = a->y - b->y;
+    vec->x = a->x - b->x;
+}
+
+// vec -= b, component-wise
+inline void vec2_sub(v2_f32* __restrict vec, const v2_f32* b) {
+    vec->y = vec->y - b->y;
+    vec->x = vec->x - b->x;
+}
+
+// vec = a * s (uniform scaling of a)
+inline void vec2_mul(v2_f32* vec, const v2_f32* a, float s) {
+    vec->y = a->y * s;
+    vec->x = a->x * s;
+}
+
+// vec *= s (uniform scaling in place)
+inline void vec2_mul(v2_f32* vec, float s) {
+    vec->y = vec->y * s;
+    vec->x = vec->x * s;
+}
+
+// Dot product of a and b. Note: with a non-const first argument the in-place overload below is selected instead.
+inline float vec2_mul(const v2_f32* a, const v2_f32* b) {
+    return b->x * a->x + b->y * a->y;
+}
+
+// vec = a * b, component-wise
+inline void vec2_mul(v2_f32* __restrict vec, const v2_f32* a, const v2_f32* b) {
+    vec->y = a->y * b->y;
+    vec->x = a->x * b->x;
+}
+
+// vec *= b, component-wise
+inline void vec2_mul(v2_f32* __restrict vec, const v2_f32* b) {
+    vec->y = vec->y * b->y;
+    vec->x = vec->x * b->x;
+}
+
+// 2D cross product of a and b as a scalar: a.x * b.y - a.y * b.x
+inline float vec2_cross(const v2_f32* a, const v2_f32* b) {
+    return b->y * a->x - b->x * a->y;
+}
+
+// Dot product of a and b
+inline float vec2_dot(const v2_f32* a, const v2_f32* b) {
+    return b->x * a->x + b->y * a->y;
+}
+
+// Normalizes (*x, *y, *z) in place; a zero-length input is left untouched instead of turning into NaN.
+void vec3_normalize_f32(float* __restrict x, float* __restrict y, float* __restrict z) {
+    float d = sqrtf((*x) * (*x) + (*y) * (*y) + (*z) * (*z));
+    if (d == 0.0f) { return; } // 0 / 0 would turn every component into NaN
+    *x /= d;
+    *y /= d;
+    *z /= d;
+}
+
+// Normalizes vec in place; a zero-length vector is left untouched instead of turning into NaN.
+void vec3_normalize_f32(v3_f32* vec) {
+    float d = sqrtf(vec->x * vec->x + vec->y * vec->y + vec->z * vec->z);
+    if (d == 0.0f) { return; } // 0 / 0 would turn every component into NaN
+    vec->x /= d;
+    vec->y /= d;
+    vec->z /= d;
+}
+
+// vec = a + b, component-wise
+inline void vec3_add(v3_f32* __restrict vec, const v3_f32* a, const v3_f32* b) {
+    vec->z = a->z + b->z;
+    vec->y = a->y + b->y;
+    vec->x = a->x + b->x;
+}
+
+// vec += b, component-wise
+inline void vec3_add(v3_f32* __restrict vec, const v3_f32* b) {
+    vec->z = vec->z + b->z;
+    vec->y = vec->y + b->y;
+    vec->x = vec->x + b->x;
+}
+
+// vec = a - b, component-wise
+inline void vec3_sub(v3_f32* __restrict vec, const v3_f32* a, const v3_f32* b) {
+    vec->z = a->z - b->z;
+    vec->y = a->y - b->y;
+    vec->x = a->x - b->x;
+}
+
+// vec -= b, component-wise
+inline void vec3_sub(v3_f32* __restrict vec, const v3_f32* b) {
+    vec->z = vec->z - b->z;
+    vec->y = vec->y - b->y;
+    vec->x = vec->x - b->x;
+}
+
+// vec = a * s (uniform scaling of a)
+inline void vec3_mul(v3_f32* vec, const v3_f32* a, float s) {
+    vec->z = a->z * s;
+    vec->y = a->y * s;
+    vec->x = a->x * s;
+}
+
+// vec *= s (uniform scaling in place)
+inline void vec3_mul(v3_f32* vec, float s) {
+    vec->z = vec->z * s;
+    vec->y = vec->y * s;
+    vec->x = vec->x * s;
+}
+
+// Dot product of a and b. Note: with a non-const first argument the in-place overload below is selected instead.
+inline float vec3_mul(const v3_f32* a, const v3_f32* b) {
+    return b->x * a->x + b->y * a->y + b->z * a->z;
+}
+
+// vec = a * b, component-wise
+inline void vec3_mul(v3_f32* __restrict vec, const v3_f32* a, const v3_f32* b) {
+    vec->z = a->z * b->z;
+    vec->y = a->y * b->y;
+    vec->x = a->x * b->x;
+}
+
+// vec *= b, component-wise
+inline void vec3_mul(v3_f32* __restrict vec, const v3_f32* b) {
+    vec->z = vec->z * b->z;
+    vec->y = vec->y * b->y;
+    vec->x = vec->x * b->x;
+}
+
+void vec3_cross(v3_f32* __restrict vec, const v3_f32* a, const v3_f32* b) { // vec = a x b; vec must not alias a or b
+    vec->x = b->z * a->y - b->y * a->z;
+    vec->y = b->x * a->z - b->z * a->x;
+    vec->z = b->y * a->x - b->x * a->y;
+}
+
+float vec3_dot(const v3_f32* a, const v3_f32* b) { // dot product of a and b
+    return b->x * a->x + b->y * a->y + b->z * a->z;
+}
+
+// Normalizes (*x, *y, *z, *w) in place; a zero-length input is left untouched instead of turning into NaN.
+void vec4_normalize_f32(float* __restrict x, float* __restrict y, float* __restrict z, float* __restrict w) {
+    float d = sqrtf((*x) * (*x) + (*y) * (*y) + (*z) * (*z) + (*w) * (*w));
+    if (d == 0.0f) { return; } // 0 / 0 would turn every component into NaN
+    *x /= d;
+    *y /= d;
+    *z /= d;
+    *w /= d;
+}
+
+// vec = a + b, component-wise
+inline void vec4_add(v4_f32* __restrict vec, const v4_f32* a, const v4_f32* b) {
+    vec->w = a->w + b->w;
+    vec->z = a->z + b->z;
+    vec->y = a->y + b->y;
+    vec->x = a->x + b->x;
+}
+
+// vec += b, component-wise
+inline void vec4_add(v4_f32* __restrict vec, const v4_f32* b) {
+    vec->w = vec->w + b->w;
+    vec->z = vec->z + b->z;
+    vec->y = vec->y + b->y;
+    vec->x = vec->x + b->x;
+}
+
+// vec = a - b, component-wise
+inline void vec4_sub(v4_f32* __restrict vec, const v4_f32* a, const v4_f32* b) {
+    vec->w = a->w - b->w;
+    vec->z = a->z - b->z;
+    vec->y = a->y - b->y;
+    vec->x = a->x - b->x;
+}
+
+// vec -= b, component-wise
+inline void vec4_sub(v4_f32* __restrict vec, const v4_f32* b) {
+    vec->w = vec->w - b->w;
+    vec->z = vec->z - b->z;
+    vec->y = vec->y - b->y;
+    vec->x = vec->x - b->x;
+}
+
+// vec = a * s (uniform scaling of a)
+inline void vec4_mul(v4_f32* vec, const v4_f32* a, float s) {
+    vec->w = a->w * s;
+    vec->z = a->z * s;
+    vec->y = a->y * s;
+    vec->x = a->x * s;
+}
+
+// vec *= s (uniform scaling in place)
+inline void vec4_mul(v4_f32* vec, float s) {
+    vec->w = vec->w * s;
+    vec->z = vec->z * s;
+    vec->y = vec->y * s;
+    vec->x = vec->x * s;
+}
+
+// Dot product of a and b. Note: with a non-const first argument the in-place overload below is selected instead.
+inline float vec4_mul(const v4_f32* a, const v4_f32* b) {
+    return b->x * a->x + b->y * a->y + b->z * a->z + b->w * a->w;
+}
+
+// vec = a * b, component-wise
+inline void vec4_mul(v4_f32* __restrict vec, const v4_f32* a, const v4_f32* b) {
+    vec->w = a->w * b->w;
+    vec->z = a->z * b->z;
+    vec->y = a->y * b->y;
+    vec->x = a->x * b->x;
+}
+
+// vec *= b, component-wise
+inline void vec4_mul(v4_f32* __restrict vec, const v4_f32* b) {
+    vec->w = vec->w * b->w;
+    vec->z = vec->z * b->z;
+    vec->y = vec->y * b->y;
+    vec->x = vec->x * b->x;
+}
+
+// Dot product of a and b
+inline float vec4_dot(const v4_f32* a, const v4_f32* b) {
+    return b->x * a->x + b->y * a->y + b->z * a->z + b->w * a->w;
+}
+
+// Cross product of the three 4D vectors a, b and c (signed 3x3 cofactors); vec must not alias the inputs (__restrict)
+inline void vec4_cross(v4_f32* __restrict vec, const v4_f32* a, const v4_f32* b, const v4_f32* c) {
+    vec->x = a->y * (b->z * c->w - b->w * c->z) - a->z * (b->y * c->w - b->w * c->y) + a->w * (b->y * c->z - b->z * c->y);
+    vec->y = -(a->x * (b->z * c->w - b->w * c->z) - a->z * (b->x * c->w - b->w * c->x) + a->w * (b->x * c->z - b->z * c->x));
+    vec->z = a->x * (b->y * c->w - b->w * c->y) - a->y * (b->x * c->w - b->w * c->x) + a->w * (b->x * c->y - b->y * c->x);
+    vec->w = -(a->x * (b->y * c->z - b->z * c->y) - a->y * (b->x * c->z - b->z * c->x) + a->z * (b->x * c->y - b->y * c->x));
+}
+
+inline
 void mat3_identity(float* matrix)
 {
     matrix[0] = 1.0f; matrix[1] = 0.0f; matrix[2] = 0.0f;
@@ -22,11 +277,13 @@ void mat3_identity(float* matrix)
     matrix[6] = 0.0f; matrix[7] = 0.0f; matrix[8] = 1.0f;
 }
 
+inline
 void mat3_identity_sparse(float* matrix)
 {
     matrix[0] = 1.0f; matrix[4] = 1.0f; matrix[8] = 1.0f;
 }
 
+inline
 void mat3_identity(__m128* matrix)
 {
     matrix[0] = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
@@ -34,6 +291,7 @@ void mat3_identity(__m128* matrix)
     matrix[2] = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
 }
 
+inline
 void mat4_identity(float* matrix)
 {
     matrix[0] = 1.0f;  matrix[1] = 0.0f;  matrix[2] = 0.0f;  matrix[3] = 0.0f;
@@ -42,11 +300,13 @@ void mat4_identity(float* matrix)
     matrix[12] = 0.0f; matrix[13] = 0.0f; matrix[14] = 0.0f; matrix[15] = 1.0f;
 }
 
+inline
 void mat4_identity_sparse(float* matrix)
 {
     matrix[0] = 1.0f; matrix[5] = 1.0f; matrix[10] = 1.0f; matrix[15] = 1.0f;
 }
 
+inline
 void mat4_identity(__m128* matrix)
 {
     matrix[0] = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
@@ -59,7 +319,7 @@ void mat4_identity(__m128* matrix)
 // https://en.wikipedia.org/wiki/Rodrigues%27_rotation_formula
 void mat4_rotation(float* matrix, float x, float y, float z, float angle)
 {
-    ASSERT_SIMPLE(OMS_ABS(x * x + y * y + z * z - 1.0f) < 0.01)
+    ASSERT_SIMPLE(OMS_ABS(x * x + y * y + z * z - 1.0f) < 0.01);
 
     // @todo replace with quaternions
     float s = sinf(angle);
@@ -129,23 +389,16 @@ void mat4_rotation(float* matrix, float pitch, float yaw, float roll)
     matrix[15] = 1.0f;
 }
 
-void mat3vec3_mult(const float* matrix, const float* vector, float* result)
+inline
+void mat3vec3_mult(const float* __restrict matrix, const float* __restrict vector, float* __restrict result)
 {
     result[0] = matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2];
     result[1] = matrix[3] * vector[0] + matrix[4] * vector[1] + matrix[5] * vector[2];
     result[2] = matrix[6] * vector[0] + matrix[7] * vector[1] + matrix[8] * vector[2];
-
-    /*
-    for (int i = 0; i < 3; ++i) {
-        result[i] = matrix[i * 3 + 0] * vector[0]
-            + matrix[i * 3 + 1] * vector[1]
-            + matrix[i * 3 + 2] * vector[2];
-    }
-    */
 }
 
 // @question could simple mul add sse be faster?
-void mat3vec3_mult_sse(const float* matrix, const float* vector, float* result)
+void mat3vec3_mult_sse(const float* __restrict matrix, const float* __restrict vector, float* __restrict result)
 {
     __m128 vec = _mm_loadu_ps(vector);
     vec = _mm_insert_ps(vec, _mm_setzero_ps(), 0x30); // vec[3] = 0
@@ -161,7 +414,7 @@ void mat3vec3_mult_sse(const float* matrix, const float* vector, float* result)
 }
 
 // @question could simple mul add sse be faster?
-void mat3vec3_mult_sse(const __m128* matrix, const __m128* vector, float* result)
+void mat3vec3_mult_sse(const __m128* __restrict matrix, const __m128* __restrict vector, float* __restrict result)
 {
     for (int i = 0; i < 3; ++i) {
         __m128 dot = _mm_dp_ps(matrix[i], *vector, 0xF1);
@@ -171,14 +424,15 @@ void mat3vec3_mult_sse(const __m128* matrix, const __m128* vector, float* result
 }
 
 // @question could simple mul add sse be faster?
-void mat3vec3_mult_sse(const __m128* matrix, const __m128* vector, __m128* result)
+void mat3vec3_mult_sse(const __m128* __restrict matrix, const __m128* __restrict vector, __m128* __restrict result)
 {
     for (int i = 0; i < 4; ++i) {
         result[i] = _mm_dp_ps(matrix[i], *vector, 0xF1);
     }
 }
 
-void mat4vec4_mult(const float* matrix, const float* vector, float* result)
+inline
+void mat4vec4_mult(const float* __restrict matrix, const float* __restrict vector, float* __restrict result)
 {
     result[0] = matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2] + matrix[3] * vector[3];
     result[1] = matrix[4] * vector[0] + matrix[5] * vector[1] + matrix[6] * vector[2] + matrix[7] * vector[3];
@@ -187,7 +441,7 @@ void mat4vec4_mult(const float* matrix, const float* vector, float* result)
 }
 
 // @question could simple mul add sse be faster?
-void mat4vec4_mult_sse(const float* matrix, const float* vector, float* result)
+void mat4vec4_mult_sse(const float* __restrict matrix, const float* __restrict vector, float* __restrict result)
 {
     __m128 vec = _mm_loadu_ps(vector);
 
@@ -200,7 +454,7 @@ void mat4vec4_mult_sse(const float* matrix, const float* vector, float* result)
 }
 
 // @question could simple mul add sse be faster?
-void mat4vec4_mult_sse(const __m128* matrix, const __m128* vector, float* result)
+void mat4vec4_mult_sse(const __m128* __restrict matrix, const __m128* __restrict vector, float* __restrict result)
 {
     for (int i = 0; i < 4; ++i) {
         __m128 dot = _mm_dp_ps(matrix[i], *vector, 0xF1);
@@ -210,14 +464,38 @@ void mat4vec4_mult_sse(const __m128* matrix, const __m128* vector, float* result
 }
 
 // @question could simple mul add sse be faster?
-void mat4vec4_mult_sse(const __m128* matrix, const __m128* vector, __m128* result)
+void mat4vec4_mult_sse(const __m128* __restrict matrix, const __m128* __restrict vector, __m128* __restrict result)
 {
     for (int i = 0; i < 4; ++i) {
         result[i] = _mm_dp_ps(matrix[i], *vector, 0xF1);
     }
 }
 
-void mat4mat4_mult(const float* a, const float* b, float* result, int steps = 8)
+// Multiplies two 4x4 matrices of 16 floats each:
+//      result[4 * r + c] = a[4 * r + 0] * b[c] + a[4 * r + 1] * b[4 + c] + a[4 * r + 2] * b[8 + c] + a[4 * r + 3] * b[12 + c]
+//      a:      left operand
+//      b:      right operand
+//      result: receives the product; because of __restrict it must not overlap a or b
+inline
+void mat4mat4_mult(const float* __restrict a, const float* __restrict b, float* __restrict result)
+{
+    for (int r = 0; r < 4; ++r) {
+        // Row r of a and the matching row of the output
+        const float* a_row = a + 4 * r;
+        float* out_row = result + 4 * r;
+
+        for (int c = 0; c < 4; ++c) {
+            // Terms are added in the order k = 0, 1, 2, 3
+            // (the same accumulation order as writing all 16 sums out by hand)
+            out_row[c] = a_row[0] * b[c]
+                + a_row[1] * b[4 + c]
+                + a_row[2] * b[8 + c]
+                + a_row[3] * b[12 + c];
+        }
+    }
+}
+
+void mat4mat4_mult(const float* __restrict a, const float* __restrict b, float* __restrict result, int steps)
 {
     if (steps > 1) {
         // @todo check http://fhtr.blogspot.com/2010/02/4x4-float-matrix-multiplication-using.html
@@ -286,29 +564,11 @@ void mat4mat4_mult(const float* a, const float* b, float* result, int steps = 8)
             )
         );
     } else {
-        result[0] = a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12];
-        result[1] = a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13];
-        result[2] = a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14];
-        result[3] = a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15];
-
-        result[4] = a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12];
-        result[5] = a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13];
-        result[6] = a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14];
-        result[7] = a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15];
-
-        result[8] = a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12];
-        result[9] = a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13];
-        result[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14];
-        result[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15];
-
-        result[12] = a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12];
-        result[13] = a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13];
-        result[14] = a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14];
-        result[15] = a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15];
+        mat4mat4_mult(a, b, result);
     }
 }
 
-void mat4mat4_mult_sse(const __m128* a, const __m128* b_transposed, float* result)
+void mat4mat4_mult_sse(const __m128* __restrict a, const __m128* __restrict b_transposed, float* __restrict result)
 {
     __m128 dot;
 
@@ -366,7 +626,8 @@ void mat4mat4_mult_sse(const __m128* a, const __m128* b_transposed, float* resul
     result[15] = _mm_cvtss_f32(dot);
 }
 
-void mat4mat4_mult_sse(const __m128* a, const __m128* b_transpose, __m128* result)
+inline
+void mat4mat4_mult_sse(const __m128* __restrict a, const __m128* __restrict b_transpose, __m128* __restrict result)
 {
     for (int i = 0; i < 4; ++i) {
         result[i] = _mm_mul_ps(a[0], b_transpose[i]);
@@ -418,7 +679,7 @@ void mat4_frustum_planes(float planes[6][4], float radius, float *matrix) {
     planes[5][3] = zfar * m[15] - m[14];
 }
 
-void mat4_frustum_sparse(
+void mat4_frustum_sparse_rh(
     float *matrix,
     float left, float right, float bottom, float top,
     float znear, float zfar
@@ -450,8 +711,40 @@ void mat4_frustum_sparse(
     //matrix[15] = 0.0f;
 }
 
+// Fills the non-zero entries of a left-handed frustum projection; all other entries of matrix are not written.
+//      matrix:                   16 floats, presumably zeroed by the caller beforehand - TODO confirm
+//      left, right, bottom, top: frustum bounds (mat4_perspective_sparse_lh passes -xmax, xmax, -ymax, ymax)
+//      znear, zfar:              near and far values used for the depth terms
+// NOTE(review): matrix[14] is positive here - confirm the sign against mat4_frustum_sparse_rh and the expected clip space.
+void mat4_frustum_sparse_lh(
+    float *matrix,
+    float left, float right, float bottom, float top,
+    float znear, float zfar
+) {
+    const float near2 = 2.0f * znear;
+    const float width = right - left;
+    const float height = top - bottom;
+    const float depth = zfar - znear;
+
+    // Entries 1-4, 6, 7, 12, 13 and 15 stay untouched
+    // Indices 0-3
+    matrix[0] = near2 / width;
+
+    // Indices 4-7
+    matrix[5] = near2 / height;
+
+    // Indices 8-11
+    matrix[8] = (right + left) / width;
+    matrix[9] = (top + bottom) / height;
+    matrix[10] = (zfar + znear) / depth;
+    matrix[11] = 1.0f;
+
+    // Indices 12-15
+    matrix[14] = (near2 * zfar) / depth;
+}
+
 // fov needs to be in rad
-void mat4_perspective_sparse(
+void mat4_perspective_sparse_lh(
     float *matrix, float fov, float aspect,
     float znear, float zfar)
 {
@@ -461,7 +754,20 @@ void mat4_perspective_sparse(
     ymax = znear * tanf(fov * 0.5f);
     xmax = ymax * aspect;
 
-    mat4_frustum_sparse(matrix, -xmax, xmax, -ymax, ymax, znear, zfar);
+    mat4_frustum_sparse_lh(matrix, -xmax, xmax, -ymax, ymax, znear, zfar);
+}
+
+// Right-handed perspective projection (sparse): derives a symmetric frustum and hands it to mat4_frustum_sparse_rh.
+// fov needs to be in rad and determines the y extent; the x extent is the y extent times aspect.
+void mat4_perspective_sparse_rh(
+    float *matrix, float fov, float aspect,
+    float znear, float zfar)
+{
+    ASSERT_SIMPLE(znear > 0.0f);
+
+    const float half_height = znear * tanf(fov * 0.5f);
+    const float half_width = half_height * aspect;
+    mat4_frustum_sparse_rh(matrix, -half_width, half_width, -half_height, half_height, znear, zfar);
+}
 
 void mat4_ortho(
@@ -494,7 +800,21 @@ void mat4_ortho(
     matrix[15] = 1.0f;
 }
 
-void mat4_translate(float* matrix, float dx, float dy, float dz, int steps = 8)
+// Multiplies matrix from the right by a translation matrix that holds dx, dy, dz at indices 3, 7 and 11.
+void mat4_translate(float* matrix, float dx, float dy, float dz)
+{
+    float translation_matrix[16] = {
+        1.0f, 0.0f, 0.0f, dx,
+        0.0f, 1.0f, 0.0f, dy,
+        0.0f, 0.0f, 1.0f, dz,
+        0.0f, 0.0f, 0.0f, 1.0f
+    };
+    float temp[16]; // copy of the input, the product is written back into matrix
+    memcpy(temp, matrix, sizeof(float) * 16);
+    mat4mat4_mult(temp, translation_matrix, matrix);
+}
+
+void mat4_translate(float* matrix, float dx, float dy, float dz, int steps)
 {
     alignas(64) float temp[16];
     memcpy(temp, matrix, sizeof(float) * 16);
@@ -505,9 +825,10 @@ void mat4_translate(float* matrix, float dx, float dy, float dz, int steps = 8)
     translation_matrix[8] = 0.0f;   translation_matrix[9] = 0.0f;   translation_matrix[10] = 1.0f;  translation_matrix[11] = dz;
     translation_matrix[12] = 0.0f; translation_matrix[13] = 0.0f; translation_matrix[14] = 0.0f; translation_matrix[15] = 1.0f;
 
-    mat4mat4_mult(temp, translation_matrix, matrix, 1);
+    mat4mat4_mult(temp, translation_matrix, matrix, steps);
 }
 
+inline
 void mat4_translation(float* matrix, float dx, float dy, float dz)
 {
     matrix[0] = 1.0f;   matrix[1] = 0.0f;   matrix[2] = 0.0f;   matrix[3] = dx;
@@ -516,6 +837,7 @@ void mat4_translation(float* matrix, float dx, float dy, float dz)
     matrix[12] = 0.0f; matrix[13] = 0.0f; matrix[14] = 0.0f; matrix[15] = 1.0f;
 }
 
+inline
 void mat4_translation_sparse(float* matrix, float dx, float dy, float dz)
 {
     matrix[3] = dx;
@@ -523,92 +845,112 @@ void mat4_translation_sparse(float* matrix, float dx, float dy, float dz)
     matrix[11] = dz;
 }
 
-// @todo unroll these loops below
-void mat4_transpose(const float* matrix, float* transposed)
+inline
+void mat4_scale(float* matrix, float dx, float dy, float dz)
 {
-    for (int i = 0; i < 4; ++i) {
-        for (int j = i + 1; j < 4; ++j) {
-            int index1 = i * 4 + j;
-            int index2 = j * 4 + i;
-
-            transposed[index1] = transposed[index2];
-            transposed[index2] = matrix[index1];
-        }
-    }
+    matrix[0] = dx;   matrix[1] = 0.0f;   matrix[2] = 0.0f;   matrix[3] = 0.0f;
+    matrix[4] = 0.0f;   matrix[5] = dy;   matrix[6] = 0.0f;   matrix[7] = 0.0f;
+    matrix[8] = 0.0f;   matrix[9] = 0.0f;   matrix[10] = dz;  matrix[11] = 0.0f;
+    matrix[12] = 0.0f; matrix[13] = 0.0f; matrix[14] = 0.0f; matrix[15] = 1.0f;
 }
 
+// Writes only the scale factors on the diagonal (indices 0, 5, 10); every other entry of matrix is left as it is.
+inline void mat4_scale_sparse(float* matrix, float dx, float dy, float dz)
+{
+    matrix[10] = dz;
+    matrix[5] = dy;
+    matrix[0] = dx;
+}
+
+// Writes the transpose of the 4x4 matrix `matrix` into `transposed` (16 floats each).
+//      matrix:     source, not modified
+//      transposed: destination; because of __restrict it must not overlap matrix
+// Fix: the diagonal (indices 0, 5, 10, 15) is copied as well. It used to be skipped, which left those four
+//      entries of `transposed` with whatever the buffer contained before the call.
+inline
+void mat4_transpose(const float* __restrict matrix, float* __restrict transposed)
+{
+    for (int r = 0; r < 4; ++r) {
+        for (int c = 0; c < 4; ++c) {
+            // element (r, c) of the source becomes element (c, r) of the result
+            transposed[c * 4 + r] = matrix[r * 4 + c];
+        }
+    }
+}
+
+
+inline
 void mat4_transpose(float* matrix)
 {
     float temp;
 
-    for (int i = 0; i < 4; ++i) {
-        for (int j = i + 1; j < 4; ++j) {
-            int index1 = i * 4 + j;
-            int index2 = j * 4 + i;
+    temp = matrix[1];
+    matrix[1] = matrix[4];
+    matrix[4] = temp;
 
-            temp = matrix[index1];
-            matrix[index1] = matrix[index2];
-            matrix[index2] = temp;
-        }
-    }
+    temp = matrix[2];
+    matrix[2] = matrix[8];
+    matrix[8] = temp;
+
+    temp = matrix[3];
+    matrix[3] = matrix[12];
+    matrix[12] = temp;
+
+    temp = matrix[6];
+    matrix[6] = matrix[9];
+    matrix[9] = temp;
+
+    temp = matrix[7];
+    matrix[7] = matrix[13];
+    matrix[13] = temp;
+
+    temp = matrix[11];
+    matrix[11] = matrix[14];
+    matrix[14] = temp;
 }
 
-void mat3_transpose(const float* matrix, float* transposed)
+// Writes the transpose of a 3x3 matrix (9 floats) into transposed; the buffers must not overlap (__restrict).
+inline void mat3_transpose(const float* __restrict matrix, float* __restrict transposed)
 {
-    for (int i = 0; i < 3; ++i) {
-        for (int j = i + 1; j < 3; ++j) {
-            int index1 = i * 3 + j;
-            int index2 = j * 3 + i;
-
-            transposed[index1] = transposed[index2];
-            transposed[index2] = matrix[index1];
-        }
-    }
+    // Fix: the diagonal (0, 4, 8) is copied too; it used to be left unwritten in transposed
+    transposed[0] = matrix[0];
+    transposed[4] = matrix[4];
+    transposed[8] = matrix[8];
+    transposed[1] = matrix[3]; transposed[2] = matrix[6]; transposed[3] = matrix[1];
+    transposed[5] = matrix[7]; transposed[6] = matrix[2]; transposed[7] = matrix[5];
 }
 
+inline
 void mat3_transpose(float* matrix)
 {
     float temp;
 
-    for (int i = 0; i < 3; ++i) {
-        for (int j = i + 1; j < 3; ++j) {
-            int index1 = i * 3 + j;
-            int index2 = j * 3 + i;
+    temp = matrix[1];
+    matrix[1] = matrix[3];
+    matrix[3] = temp;
 
-            temp = matrix[index1];
-            matrix[index1] = matrix[index2];
-            matrix[index2] = temp;
-        }
-    }
+    temp = matrix[2];
+    matrix[2] = matrix[6];
+    matrix[6] = temp;
+
+    temp = matrix[5];
+    matrix[5] = matrix[7];
+    matrix[7] = temp;
 }
 
-void mat2_transpose(const float* matrix, float* transposed)
+// Writes the transpose of a 2x2 matrix into transposed, diagonal (0, 3) included (it used to be left unwritten).
+inline void mat2_transpose(const float* __restrict matrix, float* __restrict transposed)
 {
-    for (int i = 0; i < 2; ++i) {
-        for (int j = i + 1; j < 2; ++j) {
-            int index1 = i * 2 + j;
-            int index2 = j * 2 + i;
-
-            transposed[index1] = transposed[index2];
-            transposed[index2] = matrix[index1];
-        }
-    }
+    transposed[0] = matrix[0]; transposed[1] = matrix[2];
+    transposed[2] = matrix[1]; transposed[3] = matrix[3];
 }
 
+inline
 void mat2_transpose(float* matrix)
 {
-    float temp;
-
-    for (int i = 0; i < 2; ++i) {
-        for (int j = i + 1; j < 2; ++j) {
-            int index1 = i * 2 + j;
-            int index2 = j * 2 + i;
-
-            temp = matrix[index1];
-            matrix[index1] = matrix[index2];
-            matrix[index2] = temp;
-        }
-    }
+    float temp = matrix[1];
+    matrix[1] = matrix[2];
+    matrix[2] = temp;
 }
 
 #endif
\ No newline at end of file
diff --git a/math/matrix/QuaternionFloat32.h b/math/matrix/QuaternionFloat32.h
index efbd9d4..70e5004 100644
--- a/math/matrix/QuaternionFloat32.h
+++ b/math/matrix/QuaternionFloat32.h
@@ -15,6 +15,8 @@
 #include "../../utils/MathUtils.h"
 #include "../../utils/TestUtils.h"
 
+// @todo Remove the unused functions, there are a lot of them (AFTER quaternion handling is implemented in the camera)
+
 inline
 void quaternion_unit(v4_f32* quat)
 {
@@ -29,17 +31,17 @@ void quaternion_unit(v4_f32* quat)
 inline
 void quaternion_from_euler(v4_f32* quat, float pitch, float yaw, float roll)
 {
-    float y = OMS_RAD2DEG(yaw * 0.5f);
-    float cy = cosf(y);
-    float sy = sinf(y);
+    float y = OMS_DEG2RAD(yaw);
+    float cy = cosf(y / 2);
+    float sy = sinf(y / 2);
 
-    float p = OMS_RAD2DEG(pitch * 0.5f);
-    float cp = cosf(p);
-    float sp = sinf(p);
+    float p = OMS_DEG2RAD(pitch);
+    float cp = cosf(p / 2);
+    float sp = sinf(p / 2);
 
-    float r = OMS_RAD2DEG(roll * 0.5f);
-    float cr = cosf(r);
-    float sr = sinf(r);
+    float r = OMS_DEG2RAD(roll);
+    float cr = cosf(r / 2);
+    float sr = sinf(r / 2);
 
     quat->w = cr * cp * cy + sr * sp * sy;
     quat->x = sr * cp * cy - cr * sp * sy;
@@ -50,9 +52,9 @@ void quaternion_from_euler(v4_f32* quat, float pitch, float yaw, float roll)
 }
 
 inline
-void quaternion_from_euler(v4_f32* quat, const v3_f32* v)
+void quaternion_from_euler(v4_f32* __restrict quat, const v3_f32* __restrict v)
 {
-    float y = OMS_RAD2DEG(v->v * 0.5f);
+    float y = OMS_RAD2DEG(v->v / 2);
     float cy = cosf(y);
     float sy = sinf(y);
 
@@ -70,15 +72,27 @@ void quaternion_from_euler(v4_f32* quat, const v3_f32* v)
     quat->z = cr * cp * sy - sr * sp * cy;
 }
 
-void quaternion_to_euler(const v4_f32* quat, v3_f32* v) {
+inline
+void quaternion_from_axis_angle(v4_f32* quat, const v3_f32* __restrict axis, float rad) {
+    float half_angle = rad / 2.0f;
+    float s = sinf(half_angle);
+    // Vector part = axis * sin(rad / 2), scalar part = cos(rad / 2); rad is passed straight to sinf/cosf
+    quat->x = axis->x * s;
+    quat->y = axis->y * s;
+    quat->z = axis->z * s;
+    quat->w = cosf(half_angle);
+    // presumably normalizes quat to unit length (quaternion_unit body is not visible here) - TODO confirm
+    quaternion_unit(quat);
+}
+
+void quaternion_to_euler(const v4_f32* __restrict quat, v3_f32* __restrict v) {
     // Pitch
     float sinp = 2.0f * (quat->w * quat->x + quat->y * quat->z);
     float cosp = 1.0f - 2.0f * (quat->x * quat->x + quat->y * quat->y);
     v->pitch = atan2f(sinp, cosp);
 
     // Check for gimbal lock
-    float sinp_check = 2.0f * (quat->w * quat->x + quat->y * quat->z);
-    if (OMS_ABS(sinp_check) >= 0.9999f) {
+    if (OMS_ABS(sinp) >= 0.9999f) {
         v->yaw = atan2f(quat->x * quat->z - quat->w * quat->y, quat->w * quat->x + quat->y * quat->z);
         v->roll = 0.0f;
     } else {
@@ -93,15 +107,15 @@ void quaternion_to_euler(const v4_f32* quat, v3_f32* v) {
     }
 }
 
-void quaternion_multiply(v4_f32* quat, const v4_f32* quat1, const v4_f32* quat2)
+void quaternion_multiply(v4_f32* __restrict quat, const v4_f32* __restrict quat1, const v4_f32* __restrict quat2)
 {
-    quat->w = quat1->w * quat2->w - quat1->x * quat2->x - quat1->y * quat2->y - quat1->z * quat2->z;
     quat->x = quat1->w * quat2->x + quat1->x * quat2->w + quat1->y * quat2->z - quat1->z * quat2->y;
     quat->y = quat1->w * quat2->y - quat1->x * quat2->z + quat1->y * quat2->w + quat1->z * quat2->x;
     quat->z = quat1->w * quat2->z + quat1->x * quat2->y - quat1->y * quat2->x + quat1->z * quat2->w;
+    quat->w = quat1->w * quat2->w - quat1->x * quat2->x - quat1->y * quat2->y - quat1->z * quat2->z;
 }
 
-void quaternion_inverse(v4_f32* quat, const v4_f32* quat_origin) {
+void quaternion_inverse(v4_f32* __restrict quat, const v4_f32* __restrict quat_origin) {
     float norm = quat_origin->w * quat_origin->w
         + quat_origin->x * quat_origin->x
         + quat_origin->y * quat_origin->y
@@ -114,7 +128,7 @@ void quaternion_inverse(v4_f32* quat, const v4_f32* quat_origin) {
 }
 
 inline
-void quaternion_to_rotation(f32* matrix, const v4_f32* quat)
+void quaternion_to_rotation(f32* __restrict matrix, const v4_f32* __restrict quat)
 {
     matrix[0] = 1.0f - 2.0f * (quat->y * quat->y + quat->z * quat->z);
     matrix[1] = 2.0f * (quat->x * quat->y - quat->z * quat->w);
@@ -138,7 +152,7 @@ void quaternion_to_rotation(f32* matrix, const v4_f32* quat)
 }
 
 inline
-void quaternion_to_rotation(f32* matrix, const v4_f32* quat)
+void quaternion_to_rotation_sparse(f32* __restrict matrix, const v4_f32* __restrict quat)
 {
     matrix[0] = 1.0f - 2.0f * (quat->y * quat->y + quat->z * quat->z);
     matrix[1] = 2.0f * (quat->x * quat->y - quat->z * quat->w);
@@ -154,7 +168,7 @@ void quaternion_to_rotation(f32* matrix, const v4_f32* quat)
 }
 
 inline
-void quaternion_from_vec(v4_f32* quat, const v4_f32* vec)
+void quaternion_from_vec(v4_f32* __restrict quat, const v4_f32* __restrict vec)
 {
     quat->x = vec->x;
     quat->y = vec->y;
@@ -163,7 +177,7 @@ void quaternion_from_vec(v4_f32* quat, const v4_f32* vec)
 }
 
 inline
-void quaternion_from_vec(v4_f32* quat, const v3_f32* vec)
+void quaternion_from_vec(v4_f32* __restrict quat, const v3_f32* __restrict vec)
 {
     quat->x = vec->x;
     quat->y = vec->y;
@@ -172,7 +186,7 @@ void quaternion_from_vec(v4_f32* quat, const v3_f32* vec)
 }
 
 inline
-void quaternion_to_vec(v4_f32* vec, const v4_f32* quat)
+void quaternion_to_vec(v4_f32* __restrict vec, const v4_f32* __restrict quat)
 {
     vec->x = quat->x;
     vec->y = quat->y;
@@ -181,18 +195,32 @@ void quaternion_to_vec(v4_f32* vec, const v4_f32* quat)
 }
 
 inline
-void quaternion_to_vec(v3_f32* vec, const v4_f32* quat)
+void quaternion_to_vec(v3_f32* __restrict vec, const v4_f32* __restrict quat)
 {
     vec->x = quat->x;
     vec->y = quat->y;
     vec->z = quat->z;
 }
 
+inline
+void quaternion_rotate_vector(v3_f32* __restrict vec, const v4_f32* __restrict quat, v3_f32* __restrict a)
+{
+    // @todo consider to not create this variable and cast quat to a v3_f32 pointer in cross! (the order is correct)
+    v3_f32 q2 = {quat->x, quat->y, quat->z};
+    // cross = q.xyz x a; assumes vec3_cross(out, lhs, rhs) yields lhs x rhs - TODO confirm against its definition
+    v3_f32 cross;
+    vec3_cross(&cross, &q2, a);
+    // Rotate a by the unit quaternion quat: vec = a + 2 * (w * (q x a) + q x (q x a)); BOTH terms are doubled
+    vec->x = a->x + 2.0f * (cross.x * quat->w + q2.y * cross.z - q2.z * cross.y);
+    vec->y = a->y + 2.0f * (cross.y * quat->w + q2.z * cross.x - q2.x * cross.z);
+    vec->z = a->z + 2.0f * (cross.z * quat->w + q2.x * cross.y - q2.y * cross.x);
+}
+
 // active = point rotated respective to coordinate system
 inline
-void quaternion_rotate_active(v4_f32* p, const v4_f32* quat, const v4_f32* quat_inv)
+void quaternion_rotate_active(v4_f32* __restrict p, const v4_f32* __restrict quat, const v4_f32* __restrict quat_inv)
 {
-    ASSERT_SIMPLE(OMS_ABS(x * x + y * y + z * z + w * z - 1.0f) < 0.01);
+    //ASSERT_SIMPLE(OMS_ABS(x * x + y * y + z * z + w * z - 1.0f) < 0.01);
 
     v4_f32 p_tmp;
     quaternion_multiply(&p_tmp, quat_inv, p);
@@ -201,9 +229,9 @@ void quaternion_rotate_active(v4_f32* p, const v4_f32* quat, const v4_f32* quat_
 
 // passive = coordinate system is rotated
 inline
-void quaternion_rotate_passive(v4_f32* p, const v4_f32* quat, const v4_f32* quat_inv)
+void quaternion_rotate_passive(v4_f32* __restrict p, const v4_f32* __restrict quat, const v4_f32* __restrict quat_inv)
 {
-    ASSERT_SIMPLE(OMS_ABS(x * x + y * y + z * z + w * w - 1.0f) < 0.01);
+    //ASSERT_SIMPLE(OMS_ABS(x * x + y * y + z * z + w * w - 1.0f) < 0.01);
 
     v4_f32 p_tmp;
     quaternion_multiply(&p_tmp, quat, p);
@@ -218,7 +246,7 @@ void quaternion_rotate_passive(v4_f32* p, const v4_f32* quat, const v4_f32* quat
 // 5. call quat_rotate_*
 // 6. convert quat to vec
 // @todo Since this is usually done on multiple vecs, we should probably accept an array of vecs and then use simd
-void quaternion_rotate_active(v4_f32* vec, float pitch, float yaw, float roll)
+void quaternion_rotate_active(v3_f32* vec, float pitch, float yaw, float roll)
 {
     v4_f32 q;
     quaternion_from_euler(&q, pitch, yaw, roll); // q is already in unit length
@@ -226,15 +254,16 @@ void quaternion_rotate_active(v4_f32* vec, float pitch, float yaw, float roll)
     v4_f32 q_inv;
     quaternion_inverse(&q_inv, &q);
 
-    v4_f32 p;
-    quaternion_from_vec(&p, vec);
+    v4_f32 p = { vec->x, vec->y, vec->z, 0.0f };
 
     quaternion_rotate_active(&p, &q, &q_inv);
 
-    quaternion_to_vec(vec, &p);
+    vec->x = p.x;
+    vec->y = p.y;
+    vec->z = p.z;
 }
 
-void quaternion_rotate_passive(v4_f32* vec, float pitch, float yaw, float roll)
+void quaternion_rotate_active(v4_f32* quat, float pitch, float yaw, float roll)
 {
     v4_f32 q;
     quaternion_from_euler(&q, pitch, yaw, roll); // q is already in unit length
@@ -242,12 +271,35 @@ void quaternion_rotate_passive(v4_f32* vec, float pitch, float yaw, float roll)
     v4_f32 q_inv;
     quaternion_inverse(&q_inv, &q);
 
-    v4_f32 p;
-    quaternion_from_vec(&p, vec);
+    quaternion_rotate_active(quat, &q, &q_inv);
+}
+
+void quaternion_rotate_passive(v3_f32* vec, float pitch, float yaw, float roll)
+{
+    v4_f32 q;
+    quaternion_from_euler(&q, pitch, yaw, roll); // q is already in unit length
+
+    v4_f32 q_inv;
+    quaternion_inverse(&q_inv, &q);
+
+    v4_f32 p = { vec->x, vec->y, vec->z, 0.0f };
 
     quaternion_rotate_passive(&p, &q, &q_inv);
 
-    quaternion_to_vec(vec, &p);
+    vec->x = p.x;
+    vec->y = p.y;
+    vec->z = p.z;
+}
+
+void quaternion_rotate_passive(v4_f32* quat, float pitch, float yaw, float roll)
+{
+    v4_f32 q;
+    quaternion_from_euler(&q, pitch, yaw, roll); // q is already in unit length
+
+    v4_f32 q_inv;
+    quaternion_inverse(&q_inv, &q);
+
+    quaternion_rotate_passive(quat, &q, &q_inv);
 }
 
 #endif
\ No newline at end of file
diff --git a/math/matrix/VectorFloat32.h b/math/matrix/VectorFloat32.h
index a52b145..9f34188 100644
--- a/math/matrix/VectorFloat32.h
+++ b/math/matrix/VectorFloat32.h
@@ -151,23 +151,4 @@ struct v4_f32_16 {
     };
 };
 
-void vec3_normalize_f32(float* x, float* y, float* z)
-{
-    float d = sqrt((*x) * (*x) + (*y) * (*y) + (*z) * (*z));
-
-    *x /= d;
-    *y /= d;
-    *z /= d;
-}
-
-void vec4_normalize_f32(float* x, float* y, float* z, float* w)
-{
-    float d = sqrt((*x) * (*x) + (*y) * (*y) + (*z) * (*z) + (*w) * (*w));
-
-    *x /= d;
-    *y /= d;
-    *z /= d;
-    *w /= d;
-}
-
 #endif
diff --git a/models/item/Equipment.h b/models/item/Equipment.h
index e854f27..993f01d 100644
--- a/models/item/Equipment.h
+++ b/models/item/Equipment.h
@@ -25,6 +25,9 @@ struct SEquipmentStatsPoints {
     // Item requirements
     PrimaryStatsPoints requirements;
 
+    // @todo Find a way to add/multiply stats on conditions
+    //      e.g. x% or x amount of health/resource
+
     // Item stats
     // items cannot have stats like str, they can only modify primary stats of chars (see below)
     SecondaryStatsPoints secondary_item;
diff --git a/models/mob/skill/Skill.h b/models/mob/skill/Skill.h
index 3d88952..0e126e4 100644
--- a/models/mob/skill/Skill.h
+++ b/models/mob/skill/Skill.h
@@ -22,6 +22,12 @@ struct Skill
     // const char name[MAX_SKILL_NAME];
     // const char description[MAX_SKILL_DESCRIPTION];
 
+    // @todo implement charged skills
+    //      e.g. you gain one charge every: x seconds, x mob kills, x dmg, ...
+    //      max charges
+    //      you can then use these charges
+    //      -> we could then have things that also reduce charges
+
     int id;
 
     // @todo animations
diff --git a/platform/win32/UtilsWindows.h b/platform/win32/UtilsWindows.h
index 5d9babf..bd5f1f0 100644
--- a/platform/win32/UtilsWindows.h
+++ b/platform/win32/UtilsWindows.h
@@ -14,7 +14,87 @@
 #include "../../stdlib/Types.h"
 #include "../../utils/TestUtils.h"
 
-void window_create(Window* window, void* proc)
+inline
+void window_inactive(Window* w)
+{
+    LONG_PTR style = GetWindowLongPtrA(w->hwnd, GWL_STYLE);
+    style |= WS_OVERLAPPEDWINDOW;
+    SetWindowLongPtr(w->hwnd, GWL_STYLE, style);
+    // Passing NULL lifts any cursor confinement set through ClipCursor
+    ClipCursor(NULL);
+
+    // ShowCursor only changes an internal display counter (visible while >= 0), so repeat until it is visible
+    while (ShowCursor(true) < 0) {}
+
+    w->mouse_captured = false;
+}
+
+inline
+void monitor_resolution(const Window* __restrict w, v2_int32* __restrict resolution)
+{
+    resolution->width = GetDeviceCaps(w->hdc, HORZRES);
+    resolution->height = GetDeviceCaps(w->hdc, VERTRES);
+}
+
+inline
+void monitor_resolution(Window* __restrict w)
+{
+    w->width = GetDeviceCaps(w->hdc, HORZRES);
+    w->height = GetDeviceCaps(w->hdc, VERTRES);
+}
+
+inline
+void window_active(Window* __restrict w)
+{
+    LONG_PTR style = GetWindowLongPtrA(w->hwnd, GWL_STYLE);
+    style &= ~WS_OVERLAPPEDWINDOW;
+    SetWindowLongPtr(w->hwnd, GWL_STYLE, style);
+    // SWP_FRAMECHANGED makes the style written above take effect (window style data is cached until then)
+    SetWindowPos(
+        w->hwnd, HWND_TOP,
+        w->x, w->y,
+        w->width, w->height,
+        SWP_NOACTIVATE | SWP_NOZORDER | SWP_FRAMECHANGED
+    );
+
+    RECT rect;
+    // Confine the cursor only when the rectangle was retrieved; rect stays uninitialized on failure
+    if (GetWindowRect(w->hwnd, &rect)) { ClipCursor(&rect); }
+
+    // ShowCursor only changes an internal display counter (hidden while < 0), so repeat until it is hidden
+    while (ShowCursor(false) >= 0) {}
+
+    w->mouse_captured = true;
+}
+
+inline
+void window_fullscreen(Window* __restrict w)
+{
+    monitor_resolution(w);
+    w->x = 0;
+    w->y = 0;
+    // Strip the window decorations; pointer-sized accessor so getter and setter agree on the value width
+    LONG_PTR style = GetWindowLongPtrA(w->hwnd, GWL_STYLE);
+    SetWindowLongPtr(w->hwnd, GWL_STYLE, style & ~WS_OVERLAPPEDWINDOW);
+    // No SWP_NOMOVE: the window has to move to the stored w->x/w->y; SWP_FRAMECHANGED applies the new style
+    SetWindowPos(w->hwnd, HWND_TOP, w->x, w->y, w->width, w->height, SWP_NOACTIVATE | SWP_NOZORDER | SWP_FRAMECHANGED);
+}
+
+inline
+void window_restore(Window* __restrict w)
+{
+    window_restore_state(w);
+    // Re-apply the saved style and geometry; SWP_FRAMECHANGED makes the cached frame pick up the style
+    SetWindowLongPtr(w->hwnd, GWL_STYLE, w->state_old.style);
+    SetWindowPos(
+        w->hwnd, HWND_TOP,
+        w->state_old.x, w->state_old.y,
+        w->state_old.width, w->state_old.height,
+        SWP_NOACTIVATE | SWP_NOZORDER | SWP_FRAMECHANGED
+    );
+}
+
+void window_create(Window* __restrict window, void* proc)
 {
     ASSERT_SIMPLE(proc);
 
@@ -26,6 +106,7 @@ void window_create(Window* window, void* proc)
     wc.style = CS_OWNDC;
     wc.lpfnWndProc = wndproc;
     wc.hInstance = hinstance;
+    wc.hCursor = LoadCursor(NULL, IDC_ARROW);
     wc.lpszClassName = (LPCSTR) window->name;
 
     if (!RegisterClassExA(&wc)) {
@@ -63,20 +144,17 @@ void window_create(Window* window, void* proc)
     window->hdc = GetDC(window->hwnd);
 
     ASSERT_SIMPLE(window->hwnd);
-
-    //SetWindowLongA(window->hwnd, GWL_STYLE, 0);
 }
 
-void window_open(const Window* window)
+void window_open(const Window* __restrict window)
 {
     ShowWindow(window->hwnd, SW_SHOW);
     SetForegroundWindow(window->hwnd);
 	SetFocus(window->hwnd);
-    ShowCursor(false);
     UpdateWindow(window->hwnd);
 }
 
-void window_close(Window* window)
+void window_close(Window* __restrict window)
 {
     CloseWindow(window->hwnd);
 }
diff --git a/platform/win32/Window.h b/platform/win32/Window.h
index 6b88da1..1ddc45d 100644
--- a/platform/win32/Window.h
+++ b/platform/win32/Window.h
@@ -12,17 +12,49 @@
 #include <windows.h>
 #include "../../stdlib/Types.h"
 
+struct WindowState {
+    uint64 style;
+    int32 width;
+    int32 height;
+
+    int32 x;
+    int32 y;
+};
+
 struct Window {
     bool is_fullscreen;
     int32 width;
     int32 height;
-    char name[32];
 
     int32 x;
     int32 y;
 
+    bool mouse_captured;
+
     HWND hwnd;
     HDC hdc;
+
+    char name[32];
+    WindowState state_old;
 };
 
+inline
+void window_backup_state(Window* __restrict w)
+{
+    w->state_old.style = GetWindowLongPtr(w->hwnd, GWL_STYLE);
+    w->state_old.width = w->width;
+    w->state_old.height = w->height;
+    w->state_old.x = w->x;
+    w->state_old.y = w->y;
+}
+
+inline
+void window_restore_state(Window* __restrict w)
+{
+    w->width = w->state_old.width;
+    w->height = w->state_old.height;
+    w->x = w->state_old.x;
+    w->y = w->state_old.y;
+}
+
 #endif
\ No newline at end of file
diff --git a/platform/win32/input/RawInput.h b/platform/win32/input/RawInput.h
index cd58f2d..621fdef 100644
--- a/platform/win32/input/RawInput.h
+++ b/platform/win32/input/RawInput.h
@@ -19,11 +19,17 @@
 #include "../../../memory/BufferMemory.h"
 #include <winDNS.h>
 
+#define INPUT_MOUSE_BUTTON_1 1
+#define INPUT_MOUSE_BUTTON_2 2
+#define INPUT_MOUSE_BUTTON_3 4
+#define INPUT_MOUSE_BUTTON_4 8
+#define INPUT_MOUSE_BUTTON_5 16
+
 // IMPORTANT:
 // Even if it is nowhere documented (at least not to our knowledge) the GetRawInputDeviceInfoA, GetRawInputBuffer functions requried
 // aligned memory. So far we only figured out that 4 bytes works, maybe this needs to be 8 in the future?!
 
-int input_init(HWND hwnd, InputState* states, RingMemory* ring, BufferMemory* buf)
+int input_init(HWND hwnd, Input* __restrict states, RingMemory* ring)
 {
     uint32 device_count;
     GetRawInputDeviceList(NULL, &device_count, sizeof(RAWINPUTDEVICELIST));
@@ -111,7 +117,16 @@ int input_init(HWND hwnd, InputState* states, RingMemory* ring, BufferMemory* bu
     return i;
 }
 
-void input_raw_handle(RAWINPUT* raw, InputState* states, int state_count)
+void input_mouse_position(HWND hwnd, v2_int32* pos)
+{
+    POINT p;
+    if (GetCursorPos(&p) && ScreenToClient(hwnd, &p)) {
+        pos->x = p.x;
+        pos->y = p.y;
+    }
+}
+
+void input_raw_handle(RAWINPUT* __restrict raw, Input* states, int state_count, uint64 time)
 {
     uint32 i = 0;
     if (raw->header.dwType == RIM_TYPEMOUSE) {
@@ -126,40 +141,93 @@ void input_raw_handle(RAWINPUT* raw, InputState* states, int state_count)
             return;
         }
 
-        InputState* input_state = states + i;
-
-        if (raw->data.mouse.usFlags & MOUSE_MOVE_ABSOLUTE) {
-            RECT rect;
-
-            // @todo move out, this is slow and should be stored in Window
-            // @performance this is slow and should be handled in the WindowProc !!!
-            if (raw->data.mouse.usFlags & MOUSE_VIRTUAL_DESKTOP) {
-                rect.left = GetSystemMetrics(SM_XVIRTUALSCREEN);
-                rect.top = GetSystemMetrics(SM_YVIRTUALSCREEN);
-                rect.right = GetSystemMetrics(SM_CXVIRTUALSCREEN);
-                rect.bottom = GetSystemMetrics(SM_CYVIRTUALSCREEN);
-            } else {
-                rect.left = 0;
-                rect.top = 0;
-                rect.right = GetSystemMetrics(SM_CXSCREEN);
-                rect.bottom = GetSystemMetrics(SM_CYSCREEN);
+        if (raw->data.mouse.usButtonFlags) {
+            // @question should all of these be else ifs?
+            if (raw->data.mouse.usButtonFlags & RI_MOUSE_LEFT_BUTTON_DOWN) {
+                states[i].state.mouse_down |= INPUT_MOUSE_BUTTON_1;
+                states[i].state.keys_down_time[0] = time;
+            } else if (raw->data.mouse.usButtonFlags & RI_MOUSE_LEFT_BUTTON_UP) {
+                states[i].state.mouse_down &= ~INPUT_MOUSE_BUTTON_1;
             }
 
-            input_state->x_last = input_state->x;
-            input_state->y_last = input_state->y;
+            if (raw->data.mouse.usButtonFlags & RI_MOUSE_RIGHT_BUTTON_DOWN) {
+                states[i].state.mouse_down |= INPUT_MOUSE_BUTTON_2;
+                states[i].state.keys_down_time[1] = time;
+            } else if (raw->data.mouse.usButtonFlags & RI_MOUSE_RIGHT_BUTTON_UP) {
+                states[i].state.mouse_down &= ~INPUT_MOUSE_BUTTON_2;
+            }
 
-            input_state->x = MulDiv(raw->data.mouse.lLastX, rect.right, 65535) + rect.left;
-            input_state->y = MulDiv(raw->data.mouse.lLastY, rect.bottom, 65535) + rect.top;
+            if (raw->data.mouse.usButtonFlags & RI_MOUSE_MIDDLE_BUTTON_DOWN) {
+                states[i].state.mouse_down |= INPUT_MOUSE_BUTTON_3;
+                states[i].state.keys_down_time[2] = time;
+            } else if (raw->data.mouse.usButtonFlags & RI_MOUSE_MIDDLE_BUTTON_UP) {
+                states[i].state.mouse_down &= ~INPUT_MOUSE_BUTTON_3;
+            }
 
-            input_state->state_change_mouse = true;
-        } else if (raw->data.mouse.lLastX != 0 || raw->data.mouse.lLastY != 0) {
-            input_state->x_last = input_state->x;
-            input_state->y_last = input_state->y;
+            if (raw->data.mouse.usButtonFlags & RI_MOUSE_BUTTON_4_DOWN) {
+                states[i].state.mouse_down |= INPUT_MOUSE_BUTTON_4;
+                states[i].state.keys_down_time[3] = time;
+            } else if (raw->data.mouse.usButtonFlags & RI_MOUSE_BUTTON_4_UP) {
+                states[i].state.mouse_down &= ~INPUT_MOUSE_BUTTON_4;
+            }
 
-            input_state->x = input_state->x + raw->data.mouse.lLastX;
-            input_state->y = input_state->y + raw->data.mouse.lLastY;
+            if (raw->data.mouse.usButtonFlags & RI_MOUSE_BUTTON_5_DOWN) {
+                states[i].state.mouse_down |= INPUT_MOUSE_BUTTON_5;
+                states[i].state.keys_down_time[4] = time;
+            } else if (raw->data.mouse.usButtonFlags & RI_MOUSE_BUTTON_5_UP) {
+                states[i].state.mouse_down &= ~INPUT_MOUSE_BUTTON_5;
+            }
 
-            input_state->state_change_mouse = true;
+            if (raw->data.mouse.usButtonFlags & RI_MOUSE_WHEEL) {
+                states[i].state.wheel_delta += (short) raw->data.mouse.usButtonData; // USHORT field carries a signed delta
+            }
+
+            if (raw->data.mouse.usButtonFlags & RI_MOUSE_HWHEEL) {
+                states[i].state.hwheel_delta += (short) raw->data.mouse.usButtonData; // USHORT field carries a signed delta
+            }
+
+            states[i].state_change_mouse = true;
+            states[i].state_change_mouse_button = true;
+
+            // @question is mouse wheel really considered a button change?
+            states[i].state_change_button = true;
+        }
+
+        if (states[i].mouse_movement) {
+            // @question do we want to handle every individual mouse movement here, or do we want to poll the position instead?
+            if (raw->data.mouse.usFlags & MOUSE_MOVE_ABSOLUTE) {
+                RECT rect;
+
+                // @todo move out, this is slow and should be stored in Window
+                // @performance this is slow and should be handled in the WindowProc !!!
+                if (raw->data.mouse.usFlags & MOUSE_VIRTUAL_DESKTOP) {
+                    rect.left = GetSystemMetrics(SM_XVIRTUALSCREEN);
+                    rect.top = GetSystemMetrics(SM_YVIRTUALSCREEN);
+                    rect.right = GetSystemMetrics(SM_CXVIRTUALSCREEN);
+                    rect.bottom = GetSystemMetrics(SM_CYVIRTUALSCREEN);
+                } else {
+                    rect.left = 0;
+                    rect.top = 0;
+                    rect.right = GetSystemMetrics(SM_CXSCREEN);
+                    rect.bottom = GetSystemMetrics(SM_CYSCREEN);
+                }
+
+                states[i].state.dx += raw->data.mouse.lLastX;
+                states[i].state.dy += raw->data.mouse.lLastY;
+
+                states[i].state.x = MulDiv(raw->data.mouse.lLastX, rect.right, 65535) + rect.left;
+                states[i].state.y = MulDiv(raw->data.mouse.lLastY, rect.bottom, 65535) + rect.top;
+
+                states[i].state_change_mouse = true;
+            } else if (raw->data.mouse.lLastX != 0 || raw->data.mouse.lLastY != 0) {
+                states[i].state.dx += raw->data.mouse.lLastX;
+                states[i].state.dy += raw->data.mouse.lLastY;
+
+                states[i].state.x = states[i].state.x + raw->data.mouse.lLastX;
+                states[i].state.y = states[i].state.y + raw->data.mouse.lLastY;
+
+                states[i].state_change_mouse = true;
+            }
         }
     } else if (raw->header.dwType == RIM_TYPEKEYBOARD) {
         // @todo Change so we can directly access the correct state (maybe map handle address to index?)
@@ -173,23 +241,69 @@ void input_raw_handle(RAWINPUT* raw, InputState* states, int state_count)
             return;
         }
 
-        InputState* input_state = states + i;
+        // @todo change to MakeCode instead of VKey
+        // @performance Some of the things down here seem unnecessary. We shouldn't have to loop all elements!
+        if (raw->data.keyboard.Flags & RI_KEY_BREAK) { // Flags is a bit field (E0/E1 may be set too), so test the bit
+            // Key is already released
+            if (keyboard_is_released(&states[i].state, (uint8) raw->data.keyboard.VKey)) {
+                for (int j = 0; j < MAX_KEY_PRESSES; ++j) {
+                    if (states[i].state.keys_down[j] == (uint8) raw->data.keyboard.VKey) {
+                        states[i].state.keys_down[j] = 0;
 
-        RAWKEYBOARD raw_kb =  raw->data.keyboard;
+                        break;
+                    }
+                }
 
-        if (raw_kb.Flags & RI_KEY_BREAK) {
-            input_state->keys_down_old[input_state->up_index++] = (uint8) raw_kb.VKey;
+                return;
+            }
+
+            bool empty = true;
+            for (int j = 0; j < MAX_KEY_PRESSES; ++j) {
+                if (empty && states[i].state.keys_up[j] == 0) {
+                    states[i].state.keys_up[j] = (uint8) raw->data.keyboard.VKey;
+
+                    empty = false;
+                }
+
+                // remove pressed key
+                if (states[i].state.keys_down[j] == (uint8) raw->data.keyboard.VKey) {
+                    states[i].state.keys_down[j] = 0;
+                }
+            }
+        } else { // RI_KEY_MAKE is 0, i.e. a key press is simply the absence of RI_KEY_BREAK
+            // Key is already pressed
+            if (keyboard_is_pressed(&states[i].state, (uint8) raw->data.keyboard.VKey)) {
+                for (int j = 0; j < MAX_KEY_PRESSES; ++j) {
+                    if (states[i].state.keys_up[j] == (uint8) raw->data.keyboard.VKey) {
+                        states[i].state.keys_up[j] = 0;
+
+                        break;
+                    }
+                }
+
+                return;
+            }
+
+            bool empty = true;
+            for (int j = 0; j < MAX_KEY_PRESSES; ++j) {
+                if (empty && states[i].state.keys_down[j] == 0) {
+                    states[i].state.keys_down[j] = (uint8) raw->data.keyboard.VKey;
+                    states[i].state.keys_down_time[MAX_MOUSE_PRESSES + j] = time;
+                    empty = false;
+                }
+
+                // remove released key
+                if (states[i].state.keys_up[j] == (uint8) raw->data.keyboard.VKey) {
+                    states[i].state.keys_up[j] = 0;
+                }
+            }
         }
 
-        if (raw_kb.Flags & RI_KEY_MAKE) {
-            input_state->keys_down[input_state->down_index++] = (uint8) raw_kb.VKey;
-        }
-
-        input_state->state_change_keyboard = true;
+        states[i].state_change_button = true;
     }
 }
 
-void input_handle(LPARAM lParam, InputState* states, int state_count, RingMemory* ring)
+void input_handle(LPARAM lParam, Input* __restrict states, int state_count, RingMemory* ring, uint64 time)
 {
     uint32 db_size;
     GetRawInputData((HRAWINPUT) lParam, RID_INPUT, NULL, &db_size, sizeof(RAWINPUTHEADER));
@@ -203,13 +317,18 @@ void input_handle(LPARAM lParam, InputState* states, int state_count, RingMemory
         return;
     }
 
-    input_raw_handle((RAWINPUT *) lpb, states, state_count);
+    input_raw_handle((RAWINPUT *) lpb, states, state_count, time);
 }
 
-void input_handle_buffered(LPARAM lParam, int buffer_size, InputState* states, int state_count, RingMemory* ring)
+// @bug Somehow this function skips some inputs (input_handle works)!!!!!
+void input_handle_buffered(int buffer_size, Input* __restrict states, int state_count, RingMemory* ring, uint64 time)
 {
     uint32 cb_size;
+
     GetRawInputBuffer(NULL, &cb_size, sizeof(RAWINPUTHEADER));
+    if (!cb_size) {
+        return;
+    }
 
     // Max input messages (e.g. 16)
     cb_size *= buffer_size;
@@ -217,19 +336,28 @@ void input_handle_buffered(LPARAM lParam, int buffer_size, InputState* states, i
     PRAWINPUT raw_input = (PRAWINPUT) ring_get_memory(ring, cb_size, 4);
 
     uint32 input;
-    uint32 cb_size_t = cb_size;
 
-    while ((input = GetRawInputBuffer(raw_input, &cb_size_t, sizeof(RAWINPUTHEADER))) > 0) {
+    while (true) {
+        uint32 cb_size_t = cb_size;
+        input = GetRawInputBuffer(raw_input, &cb_size_t, sizeof(RAWINPUTHEADER));
+
+        if (input == 0 || input == (uint32) -1) {
+            break;
+        }
+
         PRAWINPUT pri = raw_input;
         for (uint32 i = 0; i < input; ++i) {
-            input_raw_handle(pri, states, state_count);
+            if (!pri->header.hDevice) {
+                break;
+            }
+
+            input_raw_handle(pri, states, state_count, time);
 
             pri = NEXTRAWINPUTBLOCK(pri);
         }
-
-        // @question is this asign necessary?
-        cb_size_t = cb_size;
     }
+
+    ASSERT_SIMPLE(input != (uint32) -1); // (uint32) -1 is the value the loop above treats as a failed read
 }
 
 #endif
\ No newline at end of file
diff --git a/platform/win32/input/XInput.h b/platform/win32/input/XInput.h
index e6ac177..65dee55 100644
--- a/platform/win32/input/XInput.h
+++ b/platform/win32/input/XInput.h
@@ -55,7 +55,7 @@ void xinput_load() {
 }
 // END: Dynamically load XInput
 
-ControllerState* init_controllers()
+ControllerInput* init_controllers()
 {
     uint32 c = 0;
     for (uint32 controller_index = 0; controller_index < XUSER_MAX_COUNT; ++controller_index) {
@@ -67,7 +67,7 @@ ControllerState* init_controllers()
 
     // We always want at least one empty controller slot
     // @todo Change so that we store the actual number of devices
-    ControllerState *controllers = (ControllerState *) calloc((c + 1), sizeof(ControllerState));
+    ControllerInput *controllers = (ControllerInput *) calloc((c + 1), sizeof(ControllerInput));
 
     if (c == 0) {
         return controllers;
@@ -87,7 +87,7 @@ ControllerState* init_controllers()
     return controllers;
 }
 
-void handle_controller_input(ControllerState* states)
+void handle_controller_input(ControllerInput* states)
 {
     uint32 controller_index = 0;
     while(states[controller_index].is_connected) {
diff --git a/stdlib/Mathtypes.h b/stdlib/Mathtypes.h
index 7037ae8..9feca27 100644
--- a/stdlib/Mathtypes.h
+++ b/stdlib/Mathtypes.h
@@ -11,6 +11,8 @@
 
 #include "Types.h"
 
+// @todo Move to matrix
+
 struct v2_int32 {
     union {
         struct {
@@ -18,6 +20,11 @@ struct v2_int32 {
             int32 y;
         };
 
+        struct {
+            int32 width;
+            int32 height;
+        };
+
         int32 v[2];
     };
 };
diff --git a/utils/BitUtils.h b/utils/BitUtils.h
index 4d6307e..f76e059 100644
--- a/utils/BitUtils.h
+++ b/utils/BitUtils.h
@@ -12,6 +12,12 @@
 #include <intrin.h>
 #include "../stdlib/Types.h"
 
+#define IS_BIT_SET(num, pos) ((bool) ((num) & ((uint32) 1 << (pos)))) // unsigned 1: no signed-shift UB at pos 31
+#define BIT_SET(num, pos) ((num) | ((uint32) 1 << (pos)))
+#define BIT_UNSET(num, pos) ((num) & ~((uint32) 1 << (pos)))
+#define BIT_FLIP(num, pos) ((num) ^ ((uint32) 1 << (pos)))
+#define BIT_SET_TO(num, pos, x) (((num) & ~((uint32) 1 << (pos))) | ((uint32) (x) << (pos)))
+
 inline
 uint32 bytes_merge(byte b0, byte b1, byte b2, byte b3) {
     uint32 result = 0;
@@ -71,4 +77,64 @@ inline int find_first_set_bit(int value) {
     #endif
 }
 
+
+inline
+byte get_bits(byte data, int bits_to_read, int start_pos)
+{
+    int shift = 8 - start_pos - bits_to_read; // start_pos is counted from the most significant bit
+    return (data >> shift) & (byte) ((1 << bits_to_read) - 1);
+}
+
+inline
+uint64 get_bits(const byte* data, int bits_to_read, int start_pos)
+{
+    if (bits_to_read <= 0 || bits_to_read > (int) (sizeof(uint64) * 8) || start_pos < 0) {
+        return 0;
+    }
+    // start_pos is a bit index into data: split it into a byte index and a bit offset inside that byte
+    int byte_index = start_pos / 8;
+    int bit_offset = start_pos % 8;
+    // Shifting by the full width is undefined, so the 64 bit mask is special-cased
+    uint64_t mask = bits_to_read == 64 ? ~0ULL : (1ULL << bits_to_read) - 1;
+    uint64_t result = 0;
+
+    int bits_read = 0;
+    // Bits are taken least significant first per byte; bits read earlier land in the lower result bits
+    while (bits_read < bits_to_read) {
+        int bits_in_current_byte = 8 - bit_offset;
+        int bits_to_take = bits_to_read - bits_read;
+
+        if (bits_to_take > bits_in_current_byte) {
+            bits_to_take = bits_in_current_byte;
+        }
+
+        uint8_t current_byte = data[byte_index];
+        current_byte >>= bit_offset;
+        current_byte &= (1 << bits_to_take) - 1;
+
+        result |= ((uint64_t)current_byte << bits_read);
+
+        bits_read += bits_to_take;
+        bit_offset = 0;
+        byte_index++;
+    }
+
+    result &= mask;
+
+    return result;
+}
+
+inline
+uint32 reverse_bits(uint32 data, uint32 count)
+{
+    uint32 reversed = 0;
+    for (uint32 i = 0; i < (count + 1) / 2; ++i) { // ceil(count / 2) pair swaps; none for count == 0
+        uint32 inv = count - i - 1; // mirror position of bit i inside the lowest count bits
+        reversed |= ((data >> i) & 0x1) << inv;
+        reversed |= ((data >> inv) & 0x1) << i;
+    }
+
+    return reversed;
+}
+
 #endif
\ No newline at end of file
diff --git a/utils/MathUtils.h b/utils/MathUtils.h
index eecc99f..e5714e0 100644
--- a/utils/MathUtils.h
+++ b/utils/MathUtils.h
@@ -29,88 +29,4 @@
 
 #define SQRT_2 1.4142135623730950488016887242097f
 
-// @question Consider to implement table based sine wave + approximation if necessary
-// [-PI/2, PI/2]
-inline
-float sin_approx_pih_pih(float x)
-{
-    return x - (x * x * x / 6.0f);
-}
-
-inline
-float sinf_approx(float x)
-{
-    return 4 * x * (180 - x) / (40500 - x * (180 - x));
-}
-
-inline
-float cosf_approx(float x)
-{
-    return sinf_approx(x + OMS_RAD2DEG(OMS_PI_OVER_TWO));
-}
-
-inline
-float tanf_approx(float x)
-{
-    float sin_x = sinf_approx(x);
-    float cos_x = cosf_approx(x);
-
-    if (cos_x == 0.0f) {
-        return (sin_x > 0.0f) ? 1e10f : -1e10f;
-    }
-
-    return sin_x / cos_x;
-}
-
-inline
-float atanf_approx(float x)
-{
-    float abs_x = OMS_ABS(x);
-    float result;
-
-    if (abs_x > 1.0f) {
-        result = OMS_PI_OVER_TWO - (1.0f / abs_x);
-    } else {
-        result = abs_x - (abs_x * abs_x * abs_x / 3.0f);
-    }
-
-    return (x < 0.0f) ? -result : result;
-}
-
-inline
-float atan2f_approx(float y, float x)
-{
-    float abs_y = (float) (OMS_ABS(y) + 1.175494e-038); // prevent division by zero
-    float angle;
-
-    if (x >= 0.0f) {
-        float r = (x - abs_y) / (x + abs_y);
-        angle = OMS_PI_OVER_FOUR - OMS_PI_OVER_FOUR * r;
-    } else {
-        float r = (x + abs_y) / (abs_y - x);
-        angle = (3.0f * OMS_PI / 4.0f) - OMS_PI_OVER_FOUR * r;
-    }
-
-    return (y < 0.0f) ? -angle : angle;
-}
-
-inline
-float asinf_approx(float x)
-{
-    float negate = (x < 0) ? 1.0f : 0.0f;
-    x = OMS_ABS(x);
-
-    float result = -0.0187293f;
-    result *= x;
-    result += 0.0742610f;
-    result *= x;
-    result -= 0.2121144f;
-    result *= x;
-    result += 1.5707288f;
-    result *= sqrtf(1.0f - x);
-    result -= 2 * negate * result;
-
-    return negate * OMS_PI + result;
-}
-
 #endif
diff --git a/utils/StringUtils.h b/utils/StringUtils.h
index 9ecbfec..89c4329 100644
--- a/utils/StringUtils.h
+++ b/utils/StringUtils.h
@@ -37,6 +37,27 @@ void wchar_to_char(const wchar_t* src, char* dest, int length = 0)
     *dest = '\0';
 }
 
+// Parses an optionally signed ('-' or '+') run of decimal digits at the start of str.
+// Parsing stops at the first non-digit; no digits yields 0. Leading whitespace is not skipped.
+// Out-of-range values wrap around (two's complement) instead of overflowing a signed int.
+inline int str_to_int(const char *str)
+{
+    const int negative = (*str == '-');
+    if (negative || *str == '+') {
+        ++str;
+    }
+
+    // Accumulate unsigned: wrap-around is well defined and INT_MIN stays representable
+    unsigned int magnitude = 0;
+
+    while (*str >= '0' && *str <= '9') {
+        magnitude = magnitude * 10u + (unsigned int) (*str - '0');
+        ++str;
+    }
+
+    return (int) (negative ? 0u - magnitude : magnitude);
+}
+
 inline size_t str_count(const char* str, const char* substr)
 {
     size_t l1 = strlen(str);
diff --git a/utils/TestUtils.h b/utils/TestUtils.h
index c5c2215..df12fa6 100644
--- a/utils/TestUtils.h
+++ b/utils/TestUtils.h
@@ -46,8 +46,10 @@ void update_timing_stat(TimingStat *stat)
 // In such cases use the following macro.
 #if DEBUG
     #define UPDATE_TIMING_STAT(stat) update_timing_stat(stat)
+    #define DEBUG_OUTPUT(str) OutputDebugStringA(str) // Forwards str to OutputDebugStringA (Win32 debugger output)
 #else
     #define UPDATE_TIMING_STAT(stat) ((void)0)
+    #define DEBUG_OUTPUT(str) ((void)0) // Expands to a no-op; str is not evaluated
 #endif
 
 void profile_function(const char* func_name, void (*func)(void*), void* data, int iterations)
diff --git a/utils/Utils.h b/utils/Utils.h
index 27a0e90..2440d62 100644
--- a/utils/Utils.h
+++ b/utils/Utils.h
@@ -45,83 +45,6 @@ f32 fast_rand_percentage(void) {
     return (f32) fast_rand1() / (f32) FAST_RAND_MAX;
 }
 
-inline
-bool is_bit_set(byte data, int bit)
-{
-    return data & (1 << bit);
-}
-
-inline
-bool is_bit_set(int data, int bit)
-{
-    return data & (1 << bit);
-}
-
-inline
-bool is_bit_set(uint32 data, int bit)
-{
-    return data & (1 << bit);
-}
-
-inline
-byte get_bits(byte data, int bits_to_read, int start_pos)
-{
-    byte mask = (1 << bits_to_read) - 1;
-    return (data >> (8 - start_pos - bits_to_read)) & mask;
-}
-
-inline
-uint64 get_bits(const byte* data, int bits_to_read, int start_pos)
-{
-    if (bits_to_read <= 0 || bits_to_read > sizeof(uint64)) {
-        return 0;
-    }
-
-    int byte_index = start_pos / 8;
-    int bit_offset = start_pos % 8;
-
-    uint64_t mask = (1ULL << bits_to_read) - 1;
-    uint64_t result = 0;
-
-    int bits_read = 0;
-
-    while (bits_read < bits_to_read) {
-        int bits_in_current_byte = 8 - bit_offset;
-        int bits_to_take = bits_to_read - bits_read;
-
-        if (bits_to_take > bits_in_current_byte) {
-            bits_to_take = bits_in_current_byte;
-        }
-
-        uint8_t current_byte = data[byte_index];
-        current_byte >>= bit_offset;
-        current_byte &= (1 << bits_to_take) - 1;
-
-        result |= ((uint64_t)current_byte << bits_read);
-
-        bits_read += bits_to_take;
-        bit_offset = 0;
-        byte_index++;
-    }
-
-    result &= mask;
-
-    return result;
-}
-
-inline
-uint32 reverse_bits(uint32 data, uint32 count)
-{
-    uint32 reversed = 0;
-    for (uint32 i = 0; i <= (count / 2); ++i) {
-        uint32 inv = count - i - 1;
-        reversed |= ((data >> i) & 0x1) << inv;
-        reversed |= ((data >> inv) & 0x1) << i;
-    }
-
-    return reversed;
-}
-
 /**
  * Picks n random elements from end and stores them in begin.
  */