Commit 7f346522 by Scott

Coding style (util/string)

parent 769b2d38
......@@ -20,24 +20,25 @@
More about this license: http://www.question2answer.org/license.php
*/
// Refuse direct requests: QA_VERSION is only defined once the application has been loaded,
// so when it is missing the visitor is redirected to the parent directory instead.
if (!defined('QA_VERSION')) { // don't allow this page to be requested directly from browser
	header('Location: ../');
	exit;
}
// Functions
function qa_string_initialize()
/*
Set up some global tables to be used by other functions in this file
*/
{
/**
* Set up some global tables to be used by other functions in this file
*/
function qa_string_initialize()
{
if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }
global $qa_utf8punctuation, $qa_utf8removeaccents;
$qa_utf8punctuation=array( // converts UTF-8 punctuation characters to spaces (or in some cases, hyphens)
// converts UTF-8 punctuation characters to spaces (or in some cases, hyphens)
$qa_utf8punctuation = array(
"\xC2\xA1" => ' ', // INVERTED EXCLAMATION MARK
"\xC2\xA6" => ' ', // BROKEN BAR
"\xC2\xAB" => ' ', // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
......@@ -97,7 +98,8 @@
"\xE3\x80\x82" => ' ', // IDEOGRAPHIC FULL STOP
);
$qa_utf8removeaccents=array( // convert UTF-8 accented characters to basic Roman characters
// convert UTF-8 accented characters to basic Roman characters
$qa_utf8removeaccents = array(
"\xC3\x80" => 'A', // LATIN CAPITAL LETTER A WITH GRAVE
"\xC3\x81" => 'A', // LATIN CAPITAL LETTER A WITH ACUTE
"\xC3\x82" => 'A', // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
......@@ -412,68 +414,74 @@
"\xC8\xB6" => 't', // LATIN SMALL LETTER T WITH CURL
"\xC8\xB7" => 'j', // LATIN SMALL LETTER DOTLESS J
);
}
/**
 * Return the UTF-8 input string converted into an array of words, changed $tolowercase (or not).
 * Set $delimiters to true to keep the delimiters after each word and tweak what we used for word
 * splitting with $splitideographs and $splithyphens.
 * @param string $string UTF-8 text to split
 * @param bool $tolowercase Lower-case the text with qa_strtolower() before splitting
 * @param bool $delimiters Also return the captured separator runs (PREG_SPLIT_DELIM_CAPTURE)
 * @param bool $splitideographs Split at CJK ideographs (QA_PREG_CJK_IDEOGRAPHS_UTF8)
 * @param bool $splithyphens Treat hyphens as word separators
 * @return array
 */
function qa_string_to_words($string, $tolowercase = true, $delimiters = false, $splitideographs = true, $splithyphens = true)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	global $qa_utf8punctuation;

	if ($tolowercase)
		$string = qa_strtolower($string);

	// map UTF-8 punctuation to its replacement (spaces or hyphens) before splitting
	$string = strtr($string, $qa_utf8punctuation);

	$separator = QA_PREG_INDEX_WORD_SEPARATOR;
	if ($splithyphens)
		$separator .= '|\-';

	if ($delimiters) {
		// when delimiters are returned, ideographs become part of the separator pattern itself
		if ($splitideographs)
			$separator .= '|' . QA_PREG_CJK_IDEOGRAPHS_UTF8;
	} else {
		$string = preg_replace("/(\S)'(\S)/", '\1\2', $string); // remove apostrophes in words

		if ($splitideographs) // put spaces around CJK ideographs so they're treated as separate words
			$string = preg_replace('/' . QA_PREG_CJK_IDEOGRAPHS_UTF8 . '/', ' \0 ', $string);
	}

	return preg_split('/(' . $separator . '+)/', $string, -1, PREG_SPLIT_NO_EMPTY | ($delimiters ? PREG_SPLIT_DELIM_CAPTURE : 0));
}
/**
 * Convert accents in a UTF-8 string to ASCII.
 * Each key of the global $qa_utf8removeaccents table found in the string is replaced by its mapped value.
 * @param string $string Input string.
 * @return string
 */
function qa_string_remove_accents($string)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	global $qa_utf8removeaccents;

	return strtr($string, $qa_utf8removeaccents);
}
/**
/**
* Convert string to an SEO-friendly URL segment.
* @param string $string Input string.
* @param bool $asciiOnly If true, removes all non-ASCII characters that weren't converted.
* @param int|null $maxLength Maximum length the segment should be, or null for no limit.
* @return string
*/
function qa_slugify($string, $asciiOnly=true, $maxLength=null)
{
function qa_slugify($string, $asciiOnly = true, $maxLength = null)
{
if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }
$words = qa_string_to_words($string, true, false, false);
......@@ -485,7 +493,7 @@
}
$remaining = $maxLength;
if (array_sum($wordlength)>$remaining) {
if (array_sum($wordlength) > $remaining) {
arsort($wordlength, SORT_NUMERIC); // sort with longest words first
foreach ($wordlength as $index => $length) {
......@@ -508,32 +516,36 @@
}
return $string;
}
}
/**
 * Convert an array of tags into a string for storage in the database
 * @param array $tags List of tags; they are joined with commas
 * @return mixed|string
 */
function qa_tags_to_tagstring($tags)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	return implode(',', $tags);
}
/**
 * Convert a tag string as stored in the database into an array of tags
 * @param string $tagstring Comma-separated tags
 * @return array|mixed Empty array for an empty tag string, otherwise the tags split at commas
 */
function qa_tagstring_to_tags($tagstring)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	// empty() is also true for the string "0", which is a legitimate single tag, so exclude it explicitly
	if ($tagstring !== '0' && empty($tagstring))
		return array();

	return explode(',', $tagstring);
}
/**
/**
* Converts a string to a single line and removes words from it until it fits in the given length. Words are removed
* from a position around two thirds of the string and are replaced by the given ellipsis string
*
......@@ -543,8 +555,8 @@
* @param string $ellipsis Text used to replace the removed words from the original text
* @return string The string turned into a single line and cut to fit the given length
*/
function qa_shorten_string_line($string, $length, $ellipsis = ' ... ')
{
function qa_shorten_string_line($string, $length, $ellipsis = ' ... ')
{
if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }
$string = strtr($string, "\r\n\t", ' ');
......@@ -580,65 +592,72 @@
}
return $string;
}
}
/**
 * Removes 4-byte Unicode characters (e.g. emoji) from a string due to missing support in MySQL < 5.5.3.
 * The pattern works on raw bytes and matches every 4-byte UTF-8 sequence by its lead byte (0xF0-0xF4).
 * @param string $string
 * @return string
 */
function qa_remove_utf8mb4($string)
{
	return preg_replace('%(?:
		\xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
		| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
		| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
		)%xs', '', $string);
}
/**
 * Return an array of the words within $wordstring, each of which can contain asterisks
 * @param string $wordstring Block words separated by runs of QA_PREG_BLOCK_WORD_SEPARATOR characters
 * @return array|mixed The non-empty words
 */
function qa_block_words_explode($wordstring)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	return preg_split('/' . QA_PREG_BLOCK_WORD_SEPARATOR . '+/', $wordstring, -1, PREG_SPLIT_NO_EMPTY);
}
/**
 * Return a regular expression fragment corresponding to the block words $wordsstring
 * @param string $wordsstring Block words, split with qa_block_words_explode()
 * @return mixed|string One alternative per block word, joined with '|'
 */
function qa_block_words_to_preg($wordsstring)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	$blockwords = qa_block_words_explode($wordsstring);
	$patterns = array();

	foreach ($blockwords as $blockword) { // * in rule maps to [^ ]* in regular expression
		$pattern = str_replace('\\*', '[^ ]*', preg_quote(qa_strtolower($blockword), '/'));

		if (!preg_match('/^(' . QA_PREG_CJK_IDEOGRAPHS_UTF8 . ')/', $blockword))
			$pattern = '(?<= )' . $pattern; // assert leading word delimiter if pattern does not start with CJK

		if (!preg_match('/(' . QA_PREG_CJK_IDEOGRAPHS_UTF8 . ')$/', $blockword))
			$pattern = $pattern . '(?= )'; // assert trailing word delimiter if pattern does not end with CJK

		$patterns[] = $pattern;
	}

	return implode('|', $patterns);
}
function qa_block_words_match_all($string, $wordspreg)
/*
Return an array of matches of the regular expression fragment $wordspreg in $string, [offset] => [length]
*/
{
/**
* Return an array of matches of the regular expression fragment $wordspreg in $string, [offset] => [length]
* @param $string
* @param $wordspreg
* @return array
*/
function qa_block_words_match_all($string, $wordspreg)
{
if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }
global $qa_utf8punctuation, $qa_utf8punctuation_keeplength;
......@@ -647,119 +666,138 @@
// replace all word separators with spaces of same length
if (!is_array($qa_utf8punctuation_keeplength)) {
$qa_utf8punctuation_keeplength=array();
$qa_utf8punctuation_keeplength = array();
foreach ($qa_utf8punctuation as $key => $value)
$qa_utf8punctuation_keeplength[$key]=str_repeat(' ', strlen($key));
$qa_utf8punctuation_keeplength[$key] = str_repeat(' ', strlen($key));
}
$string=strtr(qa_strtolower($string), $qa_utf8punctuation_keeplength);
// assumes UTF-8 case conversion in qa_strtolower does not change byte length
$string=preg_replace('/'.QA_PREG_BLOCK_WORD_SEPARATOR.'/', ' ', $string);
$string = strtr(qa_strtolower($string), $qa_utf8punctuation_keeplength);
$string = preg_replace('/' . QA_PREG_BLOCK_WORD_SEPARATOR . '/', ' ', $string);
preg_match_all('/'.$wordspreg.'/', ' '.$string.' ', $pregmatches, PREG_OFFSET_CAPTURE);
preg_match_all('/' . $wordspreg . '/', ' ' . $string . ' ', $pregmatches, PREG_OFFSET_CAPTURE);
$outmatches=array();
$outmatches = array();
foreach ($pregmatches[0] as $pregmatch)
$outmatches[$pregmatch[1]-1]=strlen($pregmatch[0]);
$outmatches[$pregmatch[1] - 1] = strlen($pregmatch[0]);
return $outmatches;
}
return array();
}
}
/**
 * Return $string with any words matching the regular expression fragment $wordspreg replaced with repeated $character
 * @param string $string
 * @param string $wordspreg Regular expression fragment; an empty fragment leaves $string untouched
 * @param string $character
 * @return mixed
 */
function qa_block_words_replace($string, $wordspreg, $character = '*')
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	if (strlen($wordspreg)) {
		$matches = qa_block_words_match_all($string, $wordspreg);

		// work from the highest offset down so earlier offsets stay valid while the string is edited
		krsort($matches, SORT_NUMERIC);

		foreach ($matches as $start => $length) // get length again below to deal with multi-byte characters
			$string = substr_replace($string, str_repeat($character, qa_strlen(substr($string, $start, $length))), $start, $length);
	}

	return $string;
}
/**
 * Return a random alphanumeric string (base 36) of $length.
 * Uses random_int() where PHP provides it and falls back to mt_rand() otherwise.
 * @param int $length Number of characters wanted; zero or less gives an empty string
 * @return string Lower-case string made of the characters 0-9 and a-z
 */
function qa_random_alphanum($length)
{
	if ($length <= 0)
		return '';

	$string = '';

	// every pass appends three base-36 digits: 46655 is 36^3 - 1, the largest three-digit base-36 number
	while (strlen($string) < $length) {
		$number = null;

		if (function_exists('random_int')) {
			try {
				$number = random_int(0, 46655);
			} catch (Exception $e) {
				$number = null; // no usable randomness source, so mt_rand() is used below
			}
		}

		if ($number === null)
			$number = mt_rand(0, 46655);

		$string .= str_pad(base_convert((string) $number, 10, 36), 3, '0', STR_PAD_LEFT);
	}

	return substr($string, 0, $length);
}
/**
 * Return true or false to indicate whether $email is a valid email (this is pretty flexible compared to most real emails out there)
 * @param string $email
 * @return bool|mixed
 */
function qa_email_validate($email)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	// local part, then @, then a first domain label, a dot, and the remainder of the domain
	return preg_match("/^[\-\!\#\$\%\&\'\*\+\/\=\?\_\`\{\|\}\~a-zA-Z0-9\.\^]+\@[a-zA-Z0-9\-]+\.[a-zA-Z0-9\.\-]+$/", $email) === 1;
}
/**
 * Return the number of characters in $string, preferably using PHP's multibyte string functions
 * @param string $string Text, treated as UTF-8 when mb_strlen() is available
 * @return int|mixed Character count, or the byte count from strlen() when mb_strlen() is missing
 */
function qa_strlen($string)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	return function_exists('mb_strlen') ? mb_strlen($string, 'UTF-8') : strlen($string);
}
/**
 * Return a lower case version of $string, preferably using PHP's multibyte string functions
 * @param string $string Text, treated as UTF-8 when mb_strtolower() is available
 * @return mixed|string
 */
function qa_strtolower($string)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	return function_exists('mb_strtolower') ? mb_strtolower($string, 'UTF-8') : strtolower($string);
}
/**
 * Return $length characters from $string, starting from $start, preferably using PHP's multibyte string functions
 * @param string $string Text, treated as UTF-8 when mb_substr() is available
 * @param int $start Offset of the first character (a byte offset when falling back to substr())
 * @param int $length Maximum number of characters; the large default means "to the end of the string"
 * @return mixed|string
 */
function qa_substr($string, $start, $length=2147483647)
{
	if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

	return function_exists('mb_substr') ? mb_substr($string, $start, $length, 'UTF-8') : substr($string, $start, $length);
}
/**
 * Return whether this version of PHP has been compiled with multibyte string support
 * @return bool True when both mb_strlen() and mb_strtolower() exist
 */
function qa_has_multibyte()
{
	return function_exists('mb_strlen') && function_exists('mb_strtolower');
}
function qa_string_matches_one($string, $matches)
/*
Return true if at least one of the values in array $matches is a substring of $string. Otherwise, return false.
*/
{
/**
* Return true if at least one of the values in array $matches is a substring of $string. Otherwise, return false.
* @param $string
* @param $matches
* @return bool
*/
function qa_string_matches_one($string, $matches)
{
if (strlen($string)) {
foreach ($matches as $match) {
if (strpos($string, $match) !== false)
......@@ -768,23 +806,18 @@
}
return false;
}
}
// Some static definitions and initialization
// Each constant is only defined when it does not exist yet, so an earlier definition is left in place.

// Notable exclusions here: $ & - _ # % @
if (!defined('QA_PREG_INDEX_WORD_SEPARATOR'))
	define('QA_PREG_INDEX_WORD_SEPARATOR', '[\n\r\t\ \!\"\\\'\(\)\*\+\,\.\/\:\;\<\=\>\?\[\\\\\]\^\`\{\|\}\~]');

// Asterisk (*) excluded here because it's used to match anything
if (!defined('QA_PREG_BLOCK_WORD_SEPARATOR'))
	define('QA_PREG_BLOCK_WORD_SEPARATOR', '[\n\r\t\ \!\"\\\'\(\)\+\,\.\/\:\;\<\=\>\?\[\\\\\]\^\`\{\|\}\~\$\&\-\_\#\%\@]');

// Pattern to match Chinese/Japanese/Korean ideographic symbols in UTF-8 encoding
if (!defined('QA_PREG_CJK_IDEOGRAPHS_UTF8'))
	define('QA_PREG_CJK_IDEOGRAPHS_UTF8', '\xE2[\xBA-\xBF][\x80-\xBF]|\xE3[\x80\x88-\xBF][\x80-\xBF]|[\xE4-\xE9][\x80-\xBF][\x80-\xBF]|\xEF[\xA4-\xAB][\x80-\xBF]|\xF0[\xA0-\xAF][\x80-\xBF][\x80-\xBF]');

// Build the global punctuation and accent tables once, as soon as this file is loaded
qa_string_initialize();

// Omit PHP closing tag to help avoid accidental output
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment