cOMS/utils/RegexSimplified.h
Dennis Eichhorn dc9f37b726
Some checks failed
CodeQL / Analyze (${{ matrix.language }}) (autobuild, c-cpp) (push) Has been cancelled
Microsoft C++ Code Analysis / Analyze (push) Has been cancelled
update
2025-04-06 10:34:47 +00:00

385 lines
11 KiB
C

/**
* Jingga
*
* @copyright Jingga
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
#ifndef COMS_UTILS_REGEX_SIMPLIFIED_H
#define COMS_UTILS_REGEX_SIMPLIFIED_H
#include "../stdlib/Types.h"
#include "StringUtils.h"
/**
 * Cursor over a simplified-regex pattern string.
 *
 * The matching functions in this file share one instance and advance pos
 * as they interpret the pattern.
 */
struct SimplifiedRegexParser {
// Pattern text; the matching functions read it until they hit '\0'
const char *pattern;
// Index into pattern of the next character to interpret
int32 pos;
};
/**
 * Outcome of one matching step.
 */
struct MatchResult {
// true when the step matched
bool matched;
// Number of text characters the match consumed; callers advance the text by this amount
int32 length;
};
/**
 * Moves the parser past any run of space characters (' ') in the pattern.
 *
 * Only the space character is skipped; tabs and newlines are left in place.
 *
 * @param parser Parser whose position is advanced
 */
static
void regex_skip_whitespace(SimplifiedRegexParser *parser) {
    const char *pattern = parser->pattern;
    int32 cursor = parser->pos;

    for (; pattern[cursor] == ' '; ++cursor) {
        // Nothing to do, the loop header performs the scan
    }

    parser->pos = cursor;
}
/**
 * Matches one pattern token against the single character at *text.
 *
 * Supported tokens:
 *      \d              a digit (as decided by str_is_num())
 *      \x              the literal character x
 *      .               any character
 *      a-z, A-Z, 0-9   a character inside that range
 *      x               the literal character x
 *
 * The range tokens are tested before the literal comparison. Otherwise the
 * pattern "a-z" applied to the text "a" would consume only the 'a' of the
 * pattern and leave "-z" behind.
 *
 * On success the parser is advanced past the whole token and length is 1.
 * On failure the parser stays at the start of the token (after leading spaces),
 * also for escape sequences, so callers never see a half consumed escape.
 *
 * @param parser Parser positioned at the token
 * @param text   Text to match; only the first character is inspected
 *
 * @return {true, 1} on a match, {false, 0} otherwise (including end of pattern or end of text)
 */
MatchResult regex_match_char(SimplifiedRegexParser *parser, const char *text) {
    regex_skip_whitespace(parser);

    MatchResult result = {false, 0};
    const char *token = parser->pattern + parser->pos;
    const char c = *text;

    if (token[0] == '\0' || c == '\0') {
        return result;
    }

    // Number of pattern characters the token occupies, 0 = no match
    int32 token_length = 0;

    if (token[0] == '\\') {
        // Escape sequence, token[1] may be '\0' which never equals c here
        if (token[1] == 'd') {
            if (str_is_num(c)) {
                token_length = 2;
            }
        } else if (token[1] == c) {
            token_length = 2;
        }
    } else if (token[0] == '.') {
        // Any character
        token_length = 1;
    } else if (token[0] == 'a' && token[1] == '-' && token[2] == 'z') {
        // Range a-z, must be tested before the literal comparison
        if (c >= 'a' && c <= 'z') {
            token_length = 3;
        }
    } else if (token[0] == 'A' && token[1] == '-' && token[2] == 'Z') {
        // Range A-Z
        if (c >= 'A' && c <= 'Z') {
            token_length = 3;
        }
    } else if (token[0] == '0' && token[1] == '-' && token[2] == '9') {
        // Range 0-9
        if (str_is_num(c)) {
            token_length = 3;
        }
    } else if (token[0] == c) {
        // Literal character
        token_length = 1;
    }

    if (token_length > 0) {
        result.matched = true;
        result.length = 1;
        parser->pos += token_length;
    }

    return result;
}
/**
 * Parses an unsigned decimal number at the current pattern position.
 *
 * All consecutive digits are consumed. The value saturates at 2147483647
 * instead of overflowing, because signed overflow is undefined behavior and
 * the pattern may contain arbitrarily long digit runs.
 * NOTE(review): the limit assumes int32 is a 32 bit signed integer - confirm in Types.h
 *
 * @param parser Parser positioned at the first digit
 *
 * @return Parsed value, 0 if there is no digit at the current position
 */
int regex_parse_number(SimplifiedRegexParser *parser) {
    const int32 limit = 2147483647;
    int32 num = 0;

    while (str_is_num(parser->pattern[parser->pos])) {
        const int32 digit = parser->pattern[parser->pos] - '0';

        // Equivalent to num * 10 + digit > limit without performing the overflow
        if (num > (limit - digit) / 10) {
            num = limit;
        } else {
            num = num * 10 + digit;
        }

        parser->pos++;
    }

    return num;
}
// Declared ahead of its definition because regex_match_atom() calls it for groups
MatchResult regex_match_pattern(SimplifiedRegexParser *parser, const char *text);
/**
 * Matches one atom: either a parenthesized group or a single character token.
 *
 * For a group the text after '(' is handed to regex_match_pattern(). If the
 * parser then stands on ')' the parenthesis is consumed and the group result is
 * returned. Otherwise the parser is moved back to the '(' and the '(' is
 * matched like an ordinary character via regex_match_char().
 *
 * @param parser Parser positioned at the atom
 * @param text   Text to match against
 *
 * @return Result of the group or of the single character match
 */
MatchResult regex_match_atom(SimplifiedRegexParser *parser, const char *text) {
    regex_skip_whitespace(parser);

    // Everything that doesn't open a group is a single character token
    if (parser->pattern[parser->pos] != '(') {
        return regex_match_char(parser, text);
    }

    const int32 group_start = parser->pos;
    parser->pos = group_start + 1;

    MatchResult group = regex_match_pattern(parser, text);
    if (parser->pattern[parser->pos] == ')') {
        parser->pos++;

        return group;
    }

    // The parser did not end up on ')', fall back to the '(' itself
    parser->pos = group_start;

    return regex_match_char(parser, text);
}
MatchResult regex_match_repetition(SimplifiedRegexParser *parser, const char *text, MatchResult atom_result) {
MatchResult result = {false, 0};
parser->pos++; // Skip '{'
regex_skip_whitespace(parser);
int32 min = regex_parse_number(parser);
regex_skip_whitespace(parser);
int32 max = min;
if (parser->pattern[parser->pos] == ',') {
parser->pos++;
regex_skip_whitespace(parser);
if (parser->pattern[parser->pos] == '}') {
// {x,} means x or more (no max)
max = -1;
} else {
max = regex_parse_number(parser);
}
}
regex_skip_whitespace(parser);
if (parser->pattern[parser->pos] != '}') {
// Invalid repetition syntax
return result;
}
parser->pos++; // Skip '}'
if (min < 0 || (max != -1 && max < min)) {
// Invalid range
return result;
}
// Try to match exactly min times first
int32 count = 0;
int32 total_length = 0;
const char *current_text = text;
while (true) {
if (max != -1 && count >= max) break;
MatchResult next_result = regex_match_atom(parser, current_text);
if (!next_result.matched) break;
count++;
total_length += next_result.length;
current_text += next_result.length;
}
if (count >= min && (max == -1 || count <= max)) {
result.matched = true;
result.length = total_length;
}
return result;
}
MatchResult regex_match_element(SimplifiedRegexParser *parser, const char *text) {
MatchResult atom_result = regex_match_atom(parser, text);
if (!atom_result.matched) {
return atom_result;
}
regex_skip_whitespace(parser);
char quantifier = parser->pattern[parser->pos];
if (quantifier == '*') {
// Zero or more
parser->pos++;
int32 consumed = atom_result.length;
const char *remaining_text = text + consumed;
MatchResult star_result = {true, consumed};
while (true) {
MatchResult next_result = regex_match_atom(parser, remaining_text);
if (!next_result.matched) break;
consumed += next_result.length;
remaining_text += next_result.length;
star_result.length = consumed;
}
return star_result;
} else if (quantifier == '+') {
// One or more
parser->pos++;
int32 consumed = atom_result.length;
const char *remaining_text = text + consumed;
MatchResult plus_result = {true, consumed};
while (true) {
MatchResult next_result = regex_match_atom(parser, remaining_text);
if (!next_result.matched) break;
consumed += next_result.length;
remaining_text += next_result.length;
plus_result.length = consumed;
}
return plus_result;
} else if (quantifier == '?') {
// Zero or one
parser->pos++;
return atom_result;
} else if (quantifier == '{') {
// Min/max repetition {x,y}
return regex_match_repetition(parser, text, atom_result);
} else {
// No quantifier
return atom_result;
}
}
/**
 * Moves the parser to the end of the current alternative without matching.
 *
 * The scan stops at the first unescaped '|' or ')' that is not nested in a
 * group, at a '$' that is the last pattern character, or at the end of the
 * pattern.
 */
static
void regex_skip_branch(SimplifiedRegexParser *parser) {
    int32 depth = 0;

    while (true) {
        const char c = parser->pattern[parser->pos];
        if (c == '\0') {
            return;
        }

        if (c == '\\') {
            // Skip the backslash and the escaped character
            parser->pos++;
            if (parser->pattern[parser->pos] != '\0') {
                parser->pos++;
            }

            continue;
        }

        if (c == '(') {
            depth++;
        } else if (c == ')') {
            if (depth == 0) {
                return;
            }

            depth--;
        } else if (depth == 0
            && (c == '|' || (c == '$' && parser->pattern[parser->pos + 1] == '\0'))
        ) {
            return;
        }

        parser->pos++;
    }
}

/**
 * Matches a pattern: one or more alternatives separated by '|', each being a
 * sequence of elements.
 *
 * All elements of an alternative must match one after another, each starting
 * where the previous one stopped in the text. If several alternatives match,
 * the longest match is returned (the leftmost one on a tie). Matching is
 * greedy and doesn't backtrack into earlier elements.
 *
 * An alternative ends at '|', ')', a '$' that is the last pattern character,
 * or the end of the pattern. Afterwards the parser stands on the ')', the
 * trailing '$' or the end of the pattern, independent of the match state, so
 * that the caller can continue parsing.
 *
 * @param parser Parser positioned at the first element
 * @param text   Text position at which every alternative is tried
 *
 * @return Match state and the number of consumed text characters
 */
MatchResult regex_match_pattern(SimplifiedRegexParser *parser, const char *text) {
    MatchResult branch = {true, 0};

    while (true) {
        regex_skip_whitespace(parser);
        const int32 element_start = parser->pos;

        MatchResult element = regex_match_element(parser, text + branch.length);
        if (!element.matched) {
            // One failing element fails the whole alternative, continue behind it
            branch.matched = false;
            branch.length = 0;

            parser->pos = element_start;
            regex_skip_branch(parser);

            break;
        }

        branch.length += element.length;

        regex_skip_whitespace(parser);
        const char c = parser->pattern[parser->pos];

        if (c == '\0' || c == '|' || c == ')'
            || (c == '$' && parser->pattern[parser->pos + 1] == '\0')
            || parser->pos == element_start // No progress in the pattern, avoid an endless loop
        ) {
            break;
        }
    }

    if (parser->pattern[parser->pos] == '|') {
        parser->pos++;

        // Always evaluated so that the parser ends up behind all alternatives
        MatchResult alternative = regex_match_pattern(parser, text);
        if (alternative.matched
            && (!branch.matched || alternative.length > branch.length)
        ) {
            return alternative;
        }
    }

    return branch;
}
/**
 * Validates text against a simplified regex pattern.
 *
 * The pattern is always applied at the first character of text, there is no
 * search for a later match position. An optional leading '^' and an optional
 * '$' behind the matched pattern change the acceptance rule:
 *
 *      ^...$   the match must have the length of the whole text
 *      ^...    the match must consume at least one character
 *      ...$    the text must end directly behind the match
 *      ...     any match is accepted
 *
 * @param pattern Pattern to apply
 * @param text    Text to validate
 *
 * @return true if the text is accepted, false otherwise; always false if pattern
 *         characters are left over after matching
 */
bool regex_simplified_validate(const char* pattern, const char* text) {
    SimplifiedRegexParser parser = {pattern, 0};

    const bool anchored_start = pattern[0] == '^';
    if (anchored_start) {
        parser.pos = 1;
    }

    const MatchResult match = regex_match_pattern(&parser, text);

    const bool anchored_end = parser.pattern[parser.pos] == '$';
    if (anchored_end) {
        parser.pos++;
    }

    // Left over pattern characters mean the pattern wasn't fully applied
    if (parser.pattern[parser.pos] != '\0') {
        return false;
    }

    if (!match.matched) {
        return false;
    }

    if (anchored_start && anchored_end) {
        return match.length == str_length(text);
    }

    if (anchored_start) {
        return match.length > 0;
    }

    if (anchored_end) {
        return text[match.length] == '\0';
    }

    return true;
}
/*
// Test function
void test_regex(const char *pattern, const char *text, bool expected) {
bool result = regex_simplified_validate(pattern, text);
printf("Pattern: '%-10s'\tText: '%-6s'\tExpected: %-5s\tActual: %-5s\t%s\n",
pattern, text, expected ? "true" : "false", result ? "true" : "false",
(result == expected) ? "✓" : "✗");
}
int main() {
// Test cases
printf("Enhanced Regex Validator Tests\n");
printf("=============================\n");
// Basic tests
test_regex("abc", "abc", true);
test_regex("^abc$", "abc", true);
test_regex("^abc$", "abcd", false);
// Character classes
test_regex("a-z", "a", true);
test_regex("a-z", "z", true);
test_regex("a-z", "A", false);
test_regex("A-Z", "Z", true);
test_regex("A-Z", "a", false);
test_regex("0-9", "5", true);
test_regex("0-9", "a", false);
test_regex("\\d", "5", true);
test_regex("\\d", "a", false);
// Quantifiers
test_regex("a*", "", true);
test_regex("a*", "a", true);
test_regex("a*", "aaa", true);
test_regex("a+", "", false);
test_regex("a+", "a", true);
test_regex("a+", "aaa", true);
test_regex("a?b", "b", true);
test_regex("a?b", "ab", true);
test_regex("a?b", "aab", false);
// Groups and alternation
test_regex("(a|b)c", "ac", true);
test_regex("(a|b)c", "bc", true);
test_regex("(a|b)c", "cc", false);
test_regex("(a-z)+", "abc", true);
test_regex("(A-Z)+", "ABC", true);
test_regex("(0-9)+", "123", true);
// Escape sequences
test_regex("\\.", ".", true);
test_regex("\\.", "a", false);
test_regex("a\\db", "a0b", true);
test_regex("a\\db", "a9b", true);
test_regex("a\\db", "aab", false);
// Any character
test_regex("a.b", "a b", true);
test_regex("a.b", "a0b", true);
test_regex("a.b", "a\nb", true);
test_regex("a.b", "ab", false);
// Repetition tests
test_regex("a{2}", "aa", true);
test_regex("a{2}", "a", false);
test_regex("a{2}", "aaa", true); // More than min is allowed
test_regex("a{2,4}", "aa", true);
test_regex("a{2,4}", "aaa", true);
test_regex("a{2,4}", "aaaa", true);
test_regex("a{2,4}", "a", false);
test_regex("a{2,4}", "aaaaa", false);
test_regex("a{2,}", "aa", true);
test_regex("a{2,}", "aaaaa", true);
test_regex("a{2,}", "a", false);
test_regex("(a-z){3}", "abc", true);
test_regex("(a-z){3}", "ab", false);
test_regex("(a-z){2,4}", "ab", true);
test_regex("(a-z){2,4}", "abcd", true);
test_regex("(a-z){2,4}", "abcde", false);
test_regex("\\d{3}-\\d{2}", "123-45", true);
test_regex("\\d{3}-\\d{2}", "12-345", false);
// Combined tests
test_regex("^a{2}b{1,3}c$", "aabbc", true);
test_regex("^a{2}b{1,3}c$", "aabbbc", true);
test_regex("^a{2}b{1,3}c$", "aabc", true);
test_regex("^a{2}b{1,3}c$", "aabbbbc", false);
test_regex("^a{2}b{1,3}c$", "abbc", false);
return 0;
}
*/
#endif