From 1cb7a1ed7716ed9dc4248464524430c08b49ffb2 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Wed, 7 Oct 2020 20:57:08 +0200 Subject: [PATCH] Add search and comparison functionality --- System/Search/StringSearch.php | 186 +++++++++++++++++++++++ Utils/MbStringUtils.php | 4 - Utils/StringCompare.php | 77 ++++++++++ Utils/StringUtils.php | 4 - tests/System/CharsetTypeTest.php | 54 +++++++ tests/System/Search/StringSearchTest.php | 55 +++++++ tests/Utils/StringCompareTest.php | 14 ++ 7 files changed, 386 insertions(+), 8 deletions(-) create mode 100644 System/Search/StringSearch.php create mode 100644 tests/System/CharsetTypeTest.php create mode 100644 tests/System/Search/StringSearchTest.php diff --git a/System/Search/StringSearch.php b/System/Search/StringSearch.php new file mode 100644 index 000000000..bd7ef94f9 --- /dev/null +++ b/System/Search/StringSearch.php @@ -0,0 +1,186 @@ += $patternSize) { + return $i; + } + } + + if ($j > 0) { + $i += $shift[$j - 1]; + $j = \max($j - $shift[$j - 1], 0); + } else { + ++$i; + $j = 0; + } + } + + return -1; + } + + /** + * Create shift array + * + * @param string $pattern Pattern + * + * @return int[] + * + * @since 1.0.0 + */ + private static function knuthMorrisPrattShift(string $pattern) : array + { + $patternSize = \strlen($pattern); + $shift = []; + $shift[] = 1; + + $i = 1; + $j = 0; + while ($i + $j < $patternSize) { + if ($pattern[$i + $j] === $pattern[$j]) { + $shift[$i + $j] = $i; + ++$j; + } else { + if ($j === 0) { + $shift[$i] = $i + 1; + } + + if ($j > 0) { + $i += $shift[$j - 1]; + $j = \max($j - $shift[$j - 1], 0); + } else { + ++$i; + $j = 0; + } + } + } + + return $shift; + } + + /** + * Find pattern in string + * + * @param string $pattern Pattern + * @param string $text Text to search in + * + * @return int Match position + * + * @since 1.0.0 + */ + public static function boyerMooreHorspoolSimpleSearch(string $pattern, string $text) : int + { + $patternSize = \strlen($pattern); + $textSize = \strlen($text); + + $i = 0; + $j = 0; + while ($i + $patternSize <= $textSize) { + $j = $patternSize - 1; + + while ($text[$i + $j] === $pattern[$j]) { + --$j; + if ($j < 0) { + return $i; + } + } + + ++$i; + } + + return -1; + } + + /** + * Find pattern in string + * + * @param string $pattern Pattern + * @param string $text Text to search in + * + * @return int Match position + * + * @since 1.0.0 + */ + public static function boyerMooreHorspoolSearch(string $pattern, string $text) : int + { + $patternSize = \strlen($pattern); + $textSize = \strlen($text); + + $shift = []; + for ($k = 0; $k < 256; ++$k) { + $shift[$k] = $patternSize; + } + + for ($k = 0; $k < $patternSize - 1; ++$k) { + $shift[\ord($pattern[$k])] = $patternSize - 1 - $k; + } + + $i = 0; + $j = 0; + while ($i + $patternSize <= $textSize) { + $j = $patternSize - 1; + + while ($text[$i + $j] === $pattern[$j]) { + --$j; + if ($j < 0) { + return $i; + } + } + + $i += $shift[\ord($text[$i + $patternSize - 1])]; + } + + return -1; + } +} diff --git a/Utils/MbStringUtils.php b/Utils/MbStringUtils.php index f3d3b09ad..519a6809e 100644 --- a/Utils/MbStringUtils.php +++ b/Utils/MbStringUtils.php @@ -26,10 +26,6 @@ use phpOMS\System\CharsetType; * @link https://orange-management.org * @since 1.0.0 * - * @todo Orange-Management/phpOMS#119 - * Create jaro winkler distance - * https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance - * * @SuppressWarnings(PHPMD.CamelCaseMethodName) */ final class MbStringUtils diff --git a/Utils/StringCompare.php b/Utils/StringCompare.php index f17ee1b9b..aed4220b8 100644 --- a/Utils/StringCompare.php +++ b/Utils/StringCompare.php @@ -86,6 +86,83 @@ final class StringCompare return $bestMatch; } + /** + * Jaro string distance + * + * @param string $s1 String1 + * @param string $s2 String2 + * + * @return float + * + * @since 1.0.0 + */ + public static function jaro(string $s1, string $s2) : float + { + $s1Size = \strlen($s1); + $s2Size = \strlen($s2); + + if ($s1Size === 0) { + return $s2Size === 0 ? 1.0 : 0.0; + } + + $mDistance = (int) (\max($s1Size, $s2Size) / 2 - 1); + + $matches = 0; + $transpositions = 0.0; + + $s1Matches = []; + $s2Matches = []; + + for ($i = 0; $i < $s1Size; ++$i) { + $start = \max(0, $i - $mDistance); + $end = \min($i + $mDistance + 1, $s2Size); + + for ($j = $start; $j < $end; ++$j) { + if (isset($s2Matches[$j])) { + continue; + } + + if ($s1[$i] !== $s2[$j]) { + continue; + } + + $s1Matches[$i] = true; + $s2Matches[$j] = true; + + ++$matches; + break; + } + } + + if ($matches === 0) { + return 0.0; + } + + $j = 0; + for ($i = 0; $i < $s1Size; ++$i) { + if (!isset($s1Matches[$i])) { + continue; + } + + while (!isset($s2Matches[$j])) { + ++$j; + } + + if ($s1[$i] !== $s2[$j]) { + ++$transpositions; + } + + ++$j; + } + + $transpositions /= 2.0; + + return ($matches / $s1Size + + $matches / $s2Size + + ($matches - $transpositions) / $matches) + / 3.0; + } + /** * Calculate word match score. * diff --git a/Utils/StringUtils.php b/Utils/StringUtils.php index 40946959f..e5d498c9d 100644 --- a/Utils/StringUtils.php +++ b/Utils/StringUtils.php @@ -26,10 +26,6 @@ use phpOMS\Contract\RenderableInterface; * @link https://orange-management.org * @since 1.0.0 * - * @todo Orange-Management/phpOMS#119 - * Create jaro winkler distance - * https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance - * * @SuppressWarnings(PHPMD.CamelCaseMethodName) */ final class StringUtils diff --git a/tests/System/CharsetTypeTest.php b/tests/System/CharsetTypeTest.php new file mode 100644 index 000000000..1f85abc48 --- /dev/null +++ b/tests/System/CharsetTypeTest.php @@ -0,0 +1,54 @@ +