split string utils and mb_string utils

This commit is contained in:
Dennis Eichhorn 2020-04-11 21:31:28 +02:00
parent 12a4880f16
commit 22a63a2c32
4 changed files with 477 additions and 379 deletions

340
Utils/MbStringUtils.php Normal file
View File

@ -0,0 +1,340 @@
<?php
/**
* Orange Management
*
* PHP Version 7.4
*
* @package phpOMS\Utils
* @copyright Dennis Eichhorn
* @license OMS License 1.0
* @version 1.0.0
* @link https://orange-management.org
*/
declare(strict_types=1);
namespace phpOMS\Utils;
use phpOMS\System\CharsetType;
/**
 * Multi-byte string utils class.
 *
 * This class provides static helper functionalities for multi-byte strings.
 *
 * @package phpOMS\Utils
 * @license OMS License 1.0
 * @link    https://orange-management.org
 * @since   1.0.0
 *
 * @todo Orange-Management/phpOMS#119
 *  Create jaro winkler distance
 *  https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
 *
 * @SuppressWarnings(PHPMD.CamelCaseMethodName)
 */
final class MbStringUtils
{
    /**
     * Constructor.
     *
     * This class is purely static and is preventing any initialization
     *
     * @since 1.0.0
     * @codeCoverageIgnore
     */
    private function __construct()
    {
    }

    /**
     * Check if a string contains any of the provided needles (case sensitive).
     *
     * @param string   $haystack Haystack
     * @param string[] $needles  Needles to check if any of them are part of the haystack
     *
     * @example MbStringUtils::mb_contains('This string', ['This', 'test']); // true
     *
     * @return bool true if any of the needles is part of the haystack, false otherwise
     *
     * @since 1.0.0
     */
    public static function mb_contains(string $haystack, array $needles) : bool
    {
        foreach ($needles as $needle) {
            if (\mb_strpos($haystack, $needle) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tests if a multi byte string starts with a certain string (case sensitive).
     *
     * The function takes a string or an array of strings. In case of an array the function
     * tests if any of the needles is at the beginning of the haystack. An empty needle always matches.
     *
     * @param string       $haystack Haystack
     * @param array|string $needles  Needles to check if they are at the beginning of the haystack
     *
     * @example MbStringUtils::mb_startsWith('Test string', ['test1', 'Test']); // true
     *
     * @return bool true if any of the needles is at the beginning of the haystack, false otherwise
     *
     * @since 1.0.0
     */
    public static function mb_startsWith(string $haystack, $needles) : bool
    {
        if (\is_string($needles)) {
            $needles = [$needles];
        }

        foreach ($needles as $needle) {
            // A negative offset of the full length restricts the reverse search to a match at position 0.
            if ($needle === '' || \mb_strrpos($haystack, $needle, -\mb_strlen($haystack)) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tests if a multi byte string ends with a certain string (case sensitive).
     *
     * The function takes a string or an array of strings. In case of an array the function
     * tests if any of the needles is at the end of the haystack. An empty needle always matches.
     *
     * @param string       $haystack Haystack
     * @param array|string $needles  Needles to check if they are at the end of the haystack
     *
     * @example MbStringUtils::mb_endsWith('Test string', ['test1', 'string']); // true
     * @example MbStringUtils::mb_endsWith('Test string', 'string'); // true
     * @example MbStringUtils::mb_endsWith('Test string', 'String'); // false
     *
     * @return bool true if any of the needles is at the end of the haystack, false otherwise
     *
     * @since 1.0.0
     */
    public static function mb_endsWith(string $haystack, $needles) : bool
    {
        if (\is_string($needles)) {
            $needles = [$needles];
        }

        foreach ($needles as $needle) {
            // Searching from (haystack length - needle length) only allows a match that reaches the end.
            if ($needle === ''
                || (($temp = \mb_strlen($haystack) - \mb_strlen($needle)) >= 0
                    && \mb_strpos($haystack, $needle, $temp) !== false)
            ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Makes first letter of a multi byte string upper case.
     *
     * @param string $string String to upper case first letter
     *
     * @return string Multi byte string with first character as upper case
     *
     * @since 1.0.0
     */
    public static function mb_ucfirst(string $string) : string
    {
        return \mb_strtoupper(\mb_substr($string, 0, 1)) . \mb_substr($string, 1);
    }

    /**
     * Makes first letter of a multi byte string lower case.
     *
     * @param string $string String to lower case first letter
     *
     * @return string Multi byte string with first character as lower case
     *
     * @since 1.0.0
     */
    public static function mb_lcfirst(string $string) : string
    {
        return \mb_strtolower(\mb_substr($string, 0, 1)) . \mb_substr($string, 1);
    }

    /**
     * Trim multi byte characters from both ends of a multi byte string.
     *
     * With the default char list (a single space) the native trim() is used.
     * An empty char list returns the string unchanged.
     *
     * @param string $string   Multi byte string to trim multi byte characters from
     * @param string $charlist Multi byte character list used for trimming
     *
     * @return string Trimmed multi byte string (empty string if the regex engine fails, e.g. on invalid UTF-8)
     *
     * @since 1.0.0
     */
    public static function mb_trim(string $string, string $charlist = ' ') : string
    {
        if ($charlist === ' ') {
            return \trim($string);
        }

        if ($charlist === '') {
            return $string;
        }

        $class = self::charClass($charlist);

        // D: "$" must only match at the very end, not in front of a trailing newline.
        return \preg_replace('/^' . $class . '|' . $class . '$/uD', '', $string) ?? '';
    }

    /**
     * Trim multi byte characters from the right of a multi byte string.
     *
     * With the default char list (a single space) the native rtrim() is used.
     * An empty char list returns the string unchanged.
     *
     * @param string $string   Multi byte string to trim multi byte characters from
     * @param string $charlist Multi byte character list used for trimming
     *
     * @return string Trimmed multi byte string (empty string if the regex engine fails, e.g. on invalid UTF-8)
     *
     * @since 1.0.0
     */
    public static function mb_rtrim(string $string, string $charlist = ' ') : string
    {
        if ($charlist === ' ') {
            return \rtrim($string);
        }

        if ($charlist === '') {
            return $string;
        }

        // D: "$" must only match at the very end, not in front of a trailing newline.
        return \preg_replace('/' . self::charClass($charlist) . '$/uD', '', $string) ?? '';
    }

    /**
     * Trim multi byte characters from the left of a multi byte string.
     *
     * With the default char list (a single space) the native ltrim() is used.
     * An empty char list returns the string unchanged.
     *
     * @param string $string   Multi byte string to trim multi byte characters from
     * @param string $charlist Multi byte character list used for trimming
     *
     * @return string Trimmed multi byte string (empty string if the regex engine fails, e.g. on invalid UTF-8)
     *
     * @since 1.0.0
     */
    public static function mb_ltrim(string $string, string $charlist = ' ') : string
    {
        if ($charlist === ' ') {
            return \ltrim($string);
        }

        if ($charlist === '') {
            return $string;
        }

        return \preg_replace('/^' . self::charClass($charlist) . '/u', '', $string) ?? '';
    }

    /**
     * Build a quoted regex character class ("[...]+") from a non-empty char list.
     *
     * @param string $charlist Characters which should be part of the class (must not be empty)
     *
     * @return string Regex fragment which is safe to embed between "/" delimiters
     *
     * @since 1.0.0
     */
    private static function charClass(string $charlist) : string
    {
        return '[' . \preg_quote($charlist, '/') . ']+';
    }

    /**
     * Calculate the Shannon entropy (in bits per character) of a string.
     *
     * @param string $value String to analyze
     *
     * @return float Entropy of the string, 0.0 for an empty string
     *
     * @since 1.0.0
     */
    public static function mb_entropy(string $value) : float
    {
        $countChars = self::mb_count_chars($value);

        // Derive the length from the counts so both always use the same (UTF-8) character definition.
        $size    = \array_sum($countChars);
        $entropy = 0.0;

        foreach ($countChars as $count) {
            $p        = $count / $size;
            $entropy -= $p * \log($p, 2);
        }

        return $entropy;
    }

    /**
     * Count chars of utf-8 string.
     *
     * The result is ordered by first occurrence of the characters. Characters which are
     * decimal digits end up as integer keys because of PHP's array key normalization.
     *
     * @param string $input String to count chars
     *
     * @return array<int|string, int> Character => amount of occurrences
     *
     * @since 1.0.0
     */
    public static function mb_count_chars(string $input) : array
    {
        return \array_count_values(\mb_str_split($input, 1, 'UTF-8'));
    }

    /**
     * Get the utf-8 boundary of a string
     *
     * Looks backwards from the offset for "=XX" hex encoded bytes and moves the position
     * in front of the encoded byte if the offset would otherwise split an encoded character.
     * NOTE(review): the "=XX" format looks like quoted-printable encoded text - confirm with callers.
     *
     * @param string $text   To search for utf-8 boundary
     * @param int    $offset Search offset
     *
     * @return int Position which doesn't split an encoded character
     *
     * @since 1.0.0
     */
    public static function utf8CharBoundary(string $text, int $offset = 0) : int
    {
        $reset = 3;
        $pos   = $offset;

        do {
            $lastChunk  = \substr($text, $pos - $reset, $reset);
            $encodedPos = \strpos($lastChunk, '=');

            if ($encodedPos === false) {
                break;
            }

            $hex = \substr($text, $pos - $reset + $encodedPos + 1, 2);
            $dec = \hexdec($hex);

            if ($dec < 128) {
                // Single byte character, only step back if the "=" isn't at the start of the chunk
                if ($encodedPos > 0) {
                    $pos -= $reset - $encodedPos;
                }

                break;
            } elseif ($dec >= 192) {
                // First byte of a multi byte character, step back in front of it
                $pos -= $reset - $encodedPos;

                break;
            }

            // Continuation byte (128-191), widen the look-back window by one encoded byte
            $reset += 3;
        } while (true);

        return $pos;
    }

    /**
     * Test if a string has multibytes
     *
     * @param string $text    Text to check
     * @param string $charset Charset to check
     *
     * @return bool true if the byte length is larger than the character length in the given charset
     *
     * @since 1.0.0
     */
    public static function hasMultiBytes(string $text, string $charset = CharsetType::UTF_8) : bool
    {
        return \strlen($text) > \mb_strlen($text, $charset);
    }
}

View File

@ -72,31 +72,6 @@ final class StringUtils
return false;
}
/**
 * Report whether at least one needle occurs anywhere in the haystack.
 *
 * The comparison is case sensitive and multi-byte aware.
 *
 * @param string   $haystack String which gets searched
 * @param string[] $needles  Candidates of which one has to be part of the haystack
 *
 * @example StringUtils::mb_contains('This string', ['This', 'test']); // true
 *
 * @return bool true as soon as one needle is found, false if none is found
 *
 * @since 1.0.0
 */
public static function mb_contains(string $haystack, array $needles) : bool
{
    foreach ($needles as $candidate) {
        $position = \mb_strpos($haystack, $candidate);

        if ($position === false) {
            continue;
        }

        return true;
    }

    return false;
}
/**
* Tests if a string ends with a certain string (case sensitive).
*
@ -159,165 +134,6 @@ final class StringUtils
return false;
}
/**
 * Report whether a multi byte string begins with one of the given needles (case sensitive).
 *
 * A single string needle is handled like a list with one element.
 * An empty needle is always considered a match.
 *
 * @param string       $haystack String whose beginning gets inspected
 * @param array|string $needles  One needle or a list of needles
 *
 * @return bool true if at least one needle is at the beginning of the haystack, false otherwise
 *
 * @since 1.0.0
 */
public static function mb_startsWith(string $haystack, $needles) : bool
{
    $candidates = \is_string($needles) ? [$needles] : $needles;

    // Reverse search limited to the very first character position
    $limit = -\mb_strlen($haystack);

    foreach ($candidates as $candidate) {
        if ($candidate === '') {
            return true;
        }

        if (\mb_strrpos($haystack, $candidate, $limit) !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Report whether a multi byte string ends with one of the given needles (case sensitive).
 *
 * A single string needle is handled like a list with one element.
 * An empty needle is always considered a match.
 *
 * @param string       $haystack String whose end gets inspected
 * @param array|string $needles  One needle or a list of needles
 *
 * @example StringUtils::mb_endsWith('Test string', ['test1', 'string']); // true
 * @example StringUtils::mb_endsWith('Test string', 'string'); // true
 * @example StringUtils::mb_endsWith('Test string', 'String'); // false
 *
 * @return bool true if at least one needle is at the end of the haystack, false otherwise
 *
 * @since 1.0.0
 */
public static function mb_endsWith(string $haystack, $needles) : bool
{
    $candidates     = \is_string($needles) ? [$needles] : $needles;
    $haystackLength = \mb_strlen($haystack);

    foreach ($candidates as $candidate) {
        if ($candidate === '') {
            return true;
        }

        $start = $haystackLength - \mb_strlen($candidate);

        // Needle is longer than the haystack
        if ($start < 0) {
            continue;
        }

        if (\mb_strpos($haystack, $candidate, $start) !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Turn the first character of a multi byte string into upper case.
 *
 * @param string $string String whose first character gets upper cased
 *
 * @return string The string with an upper case first character, the rest is untouched
 *
 * @since 1.0.0
 */
public static function mb_ucfirst(string $string) : string
{
    $head = \mb_strtoupper(\mb_substr($string, 0, 1));
    $tail = \mb_substr($string, 1, \mb_strlen($string) - 1);

    return $head . $tail;
}
/**
 * Turn the first character of a multi byte string into lower case.
 *
 * @param string $string String whose first character gets lower cased
 *
 * @return string The string with a lower case first character, the rest is untouched
 *
 * @since 1.0.0
 */
public static function mb_lcfirst(string $string) : string
{
    $head = \mb_strtolower(\mb_substr($string, 0, 1));
    $tail = \mb_substr($string, 1, \mb_strlen($string) - 1);

    return $head . $tail;
}
/**
 * Trim multi byte characters from both ends of a multi byte string.
 *
 * With the default char list (a single space) the native trim() is used.
 * An empty char list returns the string unchanged.
 *
 * @param string $string   Multi byte string to trim multi byte characters from
 * @param string $charlist Multi byte character list used for trimming
 *
 * @return string Trimmed multi byte string (empty string if the regex engine fails, e.g. on invalid UTF-8)
 *
 * @since 1.0.0
 */
public static function mb_trim(string $string, string $charlist = ' ') : string
{
    if ($charlist === ' ') {
        return \trim($string);
    }

    // An empty character class would be an invalid pattern
    if ($charlist === '') {
        return $string;
    }

    $class = '[' . \preg_quote($charlist, '/') . ']+';

    // D: "$" must only match at the very end, not in front of a trailing newline.
    return \preg_replace('/^' . $class . '|' . $class . '$/uD', '', $string) ?? '';
}
/**
 * Trim multi byte characters from the right of a multi byte string.
 *
 * With the default char list (a single space) the native rtrim() is used.
 * An empty char list returns the string unchanged.
 *
 * @param string $string   Multi byte string to trim multi byte characters from
 * @param string $charlist Multi byte character list used for trimming
 *
 * @return string Trimmed multi byte string (empty string if the regex engine fails, e.g. on invalid UTF-8)
 *
 * @since 1.0.0
 */
public static function mb_rtrim(string $string, string $charlist = ' ') : string
{
    if ($charlist === ' ') {
        return \rtrim($string);
    }

    // An empty character class would be an invalid pattern
    if ($charlist === '') {
        return $string;
    }

    // D: "$" must only match at the very end, not in front of a trailing newline.
    return \preg_replace('/[' . \preg_quote($charlist, '/') . ']+$/uD', '', $string) ?? '';
}
/**
 * Trim multi byte characters from the left of a multi byte string.
 *
 * With the default char list (a single space) the native ltrim() is used.
 * An empty char list returns the string unchanged.
 *
 * @param string $string   Multi byte string to trim multi byte characters from
 * @param string $charlist Multi byte character list used for trimming
 *
 * @return string Trimmed multi byte string (empty string if the regex engine fails, e.g. on invalid UTF-8)
 *
 * @since 1.0.0
 */
public static function mb_ltrim(string $string, string $charlist = ' ') : string
{
    if ($charlist === ' ') {
        return \ltrim($string);
    }

    // An empty character class would be an invalid pattern
    if ($charlist === '') {
        return $string;
    }

    return \preg_replace('/^[' . \preg_quote($charlist, '/') . ']+/u', '', $string) ?? '';
}
/**
* Count occurrences of character at the beginning of a string.
*
@ -356,11 +172,11 @@ final class StringUtils
*
* @since 1.0.0
*/
public static function getEntropy(string $value) : float
public static function entropy(string $value) : float
{
$entroy = 0.0;
$size = \mb_strlen($value);
$countChars = self::mb_count_chars($value);
$size = \strlen($value);
$countChars = \count_chars($value);
foreach ($countChars as $v) {
$p = $v / $size;
@ -370,33 +186,6 @@ final class StringUtils
return $entroy;
}
/**
 * Count chars of utf-8 string.
 *
 * The result is ordered by first occurrence of the characters. Characters which are
 * decimal digits end up as integer keys because of PHP's array key normalization.
 *
 * @param string $input String to count chars
 *
 * @return array<int|string, int> Character => amount of occurrences
 *
 * @since 1.0.0
 */
public static function mb_count_chars(string $input) : array
{
    // Single pass split instead of calling mb_substr() for every character position
    return \array_count_values(\mb_str_split($input, 1, 'UTF-8'));
}
/**
* Turn value into string
*
@ -589,62 +378,4 @@ final class StringUtils
return ['values' => $diffValues, 'mask' => $diffMask];
}
/**
* Get the utf-8 boundary of a string
*
* Looks backwards from the offset for "=XX" hex encoded bytes and moves the position
* in front of the encoded byte if the offset would otherwise split an encoded character.
* NOTE(review): the "=XX" format looks like quoted-printable encoded text - confirm with callers.
*
* @param string $text To search for utf-8 boundary
* @param int $offset Search offset
*
* @return int Position which doesn't split an encoded character
*
* @since 1.0.0
*/
public static function utf8CharBoundary(string $text, int $offset = 0) : int
{
// Size of the look-back window, "=XX" is 3 characters long
$reset = 3;
$pos = $offset;
do {
// Inspect the last $reset characters in front of the current position
$lastChunk = \substr($text, $pos - $reset, $reset);
$encodedPos = \strpos($lastChunk, '=');
// No encoded byte in the window, the position is fine as it is
if ($encodedPos === false) {
break;
}
// The two hex digits following the "="
$hex = \substr($text, $pos - $reset + $encodedPos + 1, 2);
$dec = \hexdec($hex);
if ($dec < 128) {
// Single byte character, only step back if the "=" isn't at the start of the window
if ($encodedPos > 0) {
$pos -= $reset - $encodedPos;
}
break;
} elseif ($dec >= 192) {
// First byte of a multi byte character, step back in front of it
$pos -= $reset - $encodedPos;
break;
} elseif ($dec < 192) {
// Continuation byte (128-191), widen the look-back window by one encoded byte
$reset +=3;
}
} while(true);
return $pos;
}
/**
 * Check whether a string contains characters which need more than one byte.
 *
 * @param string $text    Text to inspect
 * @param string $charset Charset used for counting the characters
 *
 * @return bool true if the text has more bytes than characters, false otherwise
 *
 * @since 1.0.0
 */
public static function hasMultiBytes(string $text, string $charset = CharsetType::UTF_8) : bool
{
    $byteCount = \strlen($text);
    $charCount = \mb_strlen($text, $charset);

    return $byteCount > $charCount;
}
}

View File

@ -0,0 +1,133 @@
<?php
/**
* Orange Management
*
* PHP Version 7.4
*
* @package tests
* @copyright Dennis Eichhorn
* @license OMS License 1.0
* @version 1.0.0
* @link https://orange-management.org
*/
declare(strict_types=1);
namespace phpOMS\tests\Utils;
use phpOMS\Utils\MbStringUtils;
require_once __DIR__ . '/../Autoloader.php';
/**
 * @testdox phpOMS\tests\Utils\MbStringUtilsTest: String utilities
 *
 * @internal
 */
class MbStringUtilsTest extends \PHPUnit\Framework\TestCase
{
    /**
     * The prefix check has to work for ASCII and for multi-byte input and is case sensitive.
     *
     * @testdox A multi-byte string can be checked if it starts with a defined string
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testStartsMb() : void
    {
        $string = 'This is a test string.';
        self::assertTrue(MbStringUtils::mb_startsWith($string, 'This '));
        self::assertFalse(MbStringUtils::mb_startsWith($string, 'Thss '));

        $multiByte = 'Ünïcödé string';
        self::assertTrue(MbStringUtils::mb_startsWith($multiByte, 'Ünï'));
        self::assertFalse(MbStringUtils::mb_startsWith($multiByte, 'ünï'));
    }

    /**
     * The suffix check has to work for ASCII and for multi-byte input.
     *
     * @testdox A multi-byte string can be checked if it ends with a defined string
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testEndsMb() : void
    {
        $string = 'This is a test string.';
        self::assertTrue(MbStringUtils::mb_endsWith($string, 'string.'));
        self::assertFalse(MbStringUtils::mb_endsWith($string, 'strng.'));

        $multiByte = 'Test äöü';
        self::assertTrue(MbStringUtils::mb_endsWith($multiByte, 'äöü'));
        self::assertFalse(MbStringUtils::mb_endsWith($multiByte, 'äö'));
    }

    /**
     * Only the first character gets upper cased, also if it is a multi-byte character.
     *
     * @testdox The first character of a multi-byte string can be turned into upper case
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testTransformUpperCase() : void
    {
        self::assertSame('This ', MbStringUtils::mb_ucfirst('this '));
        self::assertNotEquals('this ', MbStringUtils::mb_ucfirst('this '));
        self::assertSame('Ärger', MbStringUtils::mb_ucfirst('ärger'));
    }

    /**
     * Only the first character gets lower cased, also if it is a multi-byte character.
     *
     * @testdox The first character of a multi-byte string can be turned into lower case
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testTransformLowerCase() : void
    {
        self::assertSame('thss', MbStringUtils::mb_lcfirst('Thss'));
        self::assertNotEquals('Thss', MbStringUtils::mb_lcfirst('Thss'));
        self::assertSame('ärger', MbStringUtils::mb_lcfirst('Ärger'));
    }

    /**
     * Trimming on both sides with the default and with custom (multi-byte) char lists.
     *
     * @testdox A multi-byte string can be trimmed
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testTrim() : void
    {
        $string = 'This is a test string.';
        self::assertSame($string, MbStringUtils::mb_trim($string, ' '));
        self::assertSame('This is a test string', MbStringUtils::mb_trim($string, '.'));
        self::assertSame('asdf', MbStringUtils::mb_trim(' asdf ', ' '));
        self::assertSame('asdf', MbStringUtils::mb_trim('%asdf%', '%'));
        self::assertSame('asdf', MbStringUtils::mb_trim('äasdfä', 'ä'));
    }

    /**
     * Trimming on the right side with the default and with custom (multi-byte) char lists.
     *
     * @testdox A multi-byte string can be right-trimmed
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testRTrim() : void
    {
        self::assertSame(' asdf', MbStringUtils::mb_rtrim(' asdf '));
        self::assertSame('%asdf', MbStringUtils::mb_rtrim('%asdf%', '%'));
        self::assertSame('äasdf', MbStringUtils::mb_rtrim('äasdfä', 'ä'));
    }

    /**
     * Trimming on the left side with the default and with custom (multi-byte) char lists.
     *
     * @testdox A multi-byte string can be left-trimmed
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testLTrim() : void
    {
        self::assertSame('asdf ', MbStringUtils::mb_ltrim(' asdf '));
        self::assertSame('asdf%', MbStringUtils::mb_ltrim('%asdf%', '%'));
        self::assertSame('asdfä', MbStringUtils::mb_ltrim('äasdfä', 'ä'));
    }

    /**
     * One matching needle is enough, needles may contain multi-byte characters.
     *
     * @testdox A multi-byte string can be checked if it contains at least one defined string element
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testContainsMb() : void
    {
        $string = 'This is a test string.';
        self::assertTrue(MbStringUtils::mb_contains($string, ['is', 'nothing', 'string']));
        self::assertFalse(MbStringUtils::mb_contains($string, ['iss', 'nothing', 'false']));

        self::assertTrue(MbStringUtils::mb_contains('Ünïcödé', ['x', 'cöd']));
        self::assertFalse(MbStringUtils::mb_contains('Ünïcödé', ['x', 'cod']));
    }

    /**
     * Characters are counted per multi-byte character, not per byte.
     *
     * @testdox The characters of a multi-byte string can be counted
     * @covers phpOMS\Utils\MbStringUtils
     * @group framework
     */
    public function testCountMb() : void
    {
        $counts = MbStringUtils::mb_count_chars('αααααΕεΙιΜμΨψ');

        self::assertSame(5, $counts['α']);
        self::assertSame(1, $counts['Ψ']);
        self::assertCount(9, $counts);
    }
}

View File

@ -33,7 +33,7 @@ class StringUtilsTest extends \PHPUnit\Framework\TestCase
*/
public function testEntropy() : void
{
// Both assertions expect an entropy of 2.5 (+/- 0.1) for 'akj@!0aj'.
// NOTE(review): getEntropy() and entropy() are both asserted here; this looks like the old and the new line of a diff hunk shown together - confirm which one is current.
self::assertTrue(\abs(2.5 - StringUtils::getEntropy('akj@!0aj')) < 0.1);
self::assertEqualsWithDelta(2.5, StringUtils::entropy('akj@!0aj'), 0.1);
}
/**
@ -60,89 +60,6 @@ class StringUtilsTest extends \PHPUnit\Framework\TestCase
self::assertFalse(StringUtils::endsWith($string, 'strng.'));
}
/**
 * A matching prefix has to be accepted and a non-matching prefix has to be rejected.
 *
 * @testdox A multi-byte string can be checked if it starts with a defined string
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testStartsMb() : void
{
    $haystack = 'This is a test string.';

    $matching = StringUtils::mb_startsWith($haystack, 'This ');
    self::assertTrue($matching);

    $notMatching = StringUtils::mb_startsWith($haystack, 'Thss ');
    self::assertFalse($notMatching);
}
/**
 * A matching suffix has to be accepted and a non-matching suffix has to be rejected.
 *
 * @testdox A multi-byte string can be checked if it ends with a defined string
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testEndsMb() : void
{
    $haystack = 'This is a test string.';

    $matching = StringUtils::mb_endsWith($haystack, 'string.');
    self::assertTrue($matching);

    $notMatching = StringUtils::mb_endsWith($haystack, 'strng.');
    self::assertFalse($notMatching);
}
/**
 * The first character has to become upper case, the result must differ from the lower case input.
 *
 * @testdox The first character of a multi-byte string can be turned into upper case
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testTransformUpperCase() : void
{
    $input = 'this ';

    $first = StringUtils::mb_ucfirst($input);
    self::assertEquals('This ', $first);

    $second = StringUtils::mb_ucfirst($input);
    self::assertNotEquals('this ', $second);
}
/**
 * The first character has to become lower case, the result must differ from the upper case input.
 *
 * @testdox The first character of a multi-byte string can be turned into lower case
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testTransformLowerCase() : void
{
    $input = 'Thss';

    $first = StringUtils::mb_lcfirst($input);
    self::assertEquals('thss', $first);

    $second = StringUtils::mb_lcfirst($input);
    self::assertNotEquals('Thss', $second);
}
/**
 * Trimming on both sides with a space and with custom char lists.
 *
 * @testdox A multi-byte string can be trimmed
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testTrim() : void
{
    $sentence = 'This is a test string.';

    $spaceTrimmed = StringUtils::mb_trim($sentence, ' ');
    self::assertEquals($sentence, $spaceTrimmed);

    $dotTrimmed = StringUtils::mb_trim($sentence, '.');
    self::assertEquals('This is a test string', $dotTrimmed);

    $padded = StringUtils::mb_trim(' asdf ', ' ');
    self::assertEquals('asdf', $padded);

    $percent = StringUtils::mb_trim('%asdf%', '%');
    self::assertEquals('asdf', $percent);
}
/**
 * Trimming on the right side with the default and with a custom char list.
 *
 * @testdox A multi-byte string can be right-trimmed
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testRTrim() : void
{
    $defaultTrimmed = StringUtils::mb_rtrim(' asdf ');
    self::assertEquals(' asdf', $defaultTrimmed);

    $percentTrimmed = StringUtils::mb_rtrim('%asdf%', '%');
    self::assertEquals('%asdf', $percentTrimmed);
}
/**
 * Trimming on the left side with the default and with a custom char list.
 *
 * @testdox A multi-byte string can be left-trimmed
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testLTrim() : void
{
    $defaultTrimmed = StringUtils::mb_ltrim(' asdf ');
    self::assertEquals('asdf ', $defaultTrimmed);

    $percentTrimmed = StringUtils::mb_ltrim('%asdf%', '%');
    self::assertEquals('asdf%', $percentTrimmed);
}
/**
* @testdox A string can be checked if it contains at least one defined string element
* @covers phpOMS\Utils\StringUtils
@ -156,29 +73,6 @@ class StringUtilsTest extends \PHPUnit\Framework\TestCase
self::assertFalse(StringUtils::contains($string, ['iss', 'nothing', 'false']));
}
/**
 * One contained needle is enough for a match, without a contained needle there is no match.
 *
 * @testdox A multi-byte string can be checked if it contains at least one defined string element
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testContainsMb() : void
{
    $haystack = 'This is a test string.';

    $found = StringUtils::mb_contains($haystack, ['is', 'nothing', 'string']);
    self::assertTrue($found);

    $notFound = StringUtils::mb_contains($haystack, ['iss', 'nothing', 'false']);
    self::assertFalse($notFound);
}
/**
 * The amount of a repeated multi-byte character has to be counted per character.
 *
 * @testdox The characters of a multi-byte string can be counted
 * @covers phpOMS\Utils\StringUtils
 * @group framework
 */
public function testCountMb() : void
{
    $counts = StringUtils::mb_count_chars('αααααΕεΙιΜμΨψ');

    self::assertEquals(5, $counts['α']);
}
/**
* @testdox The amount of a defined characters in the beginning of a string can be counted
* @covers phpOMS\Utils\StringUtils