| 1 | <?php␊ |
| 2 | /* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */␊ |
| 3 | /*␊ |
| 4 | # ***** BEGIN LICENSE BLOCK *****␊ |
| 5 | # This file is part of Plume Framework, a simple PHP Application Framework.␊ |
| 6 | # Copyright (C) 2001-2007 Loic d'Anterroches and contributors.␊ |
| 7 | #␊ |
| 8 | # Plume Framework is free software; you can redistribute it and/or modify␊ |
| 9 | # it under the terms of the GNU Lesser General Public License as published by␊ |
| 10 | # the Free Software Foundation; either version 2.1 of the License, or␊ |
| 11 | # (at your option) any later version.␊ |
| 12 | #␊ |
| 13 | # Plume Framework is distributed in the hope that it will be useful,␊ |
| 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of␊ |
| 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the␊ |
| 16 | # GNU Lesser General Public License for more details.␊ |
| 17 | #␊ |
| 18 | # You should have received a copy of the GNU Lesser General Public License␊ |
| 19 | # along with this program; if not, write to the Free Software␊ |
| 20 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA␊ |
| 21 | #␊ |
| 22 | # ***** END LICENSE BLOCK ***** */␊ |
| 23 | ␊ |
| 24 | /**␊ |
| 25 | * Utility class to clean/manipulate strings. ␊ |
| 26 | */␊ |
| 27 | ␊ |
class Pluf_Text
{
    /**
     * Wrap a string containing HTML code.
     *
     * The text is scanned character by character. Everything between
     * a '<' and the next '>' is copied verbatim and never counted in
     * the line length, so the markup itself is not broken. Words are
     * moved to a new line when they would not fit on the current
     * one, and are only cut when they reach $length on their own.
     *
     * Note that an existing line break in the input is only
     * recognised when $wrapString is a single character, because the
     * comparison is done against one character at a time.
     *
     * Improved from a version available on php.net
     *
     * @see http://www.php.net/manual/en/function.wordwrap.php#89782
     *
     * @param string The string to wrap
     * @param int The maximal length of a line (45)
     * @param string Wrap string ("\n")
     * @return string Wrapped string
     */
    public static function wrapHtml($string, $length=45, $wrapString="\n")
    {
        $wrapped = '';
        $word = '';
        $html = false;
        $line_len = 0;
        $n = mb_strlen($string, 'UTF-8');
        for ($i=0; $i<$n; $i++) {
            $char = mb_substr($string, $i, 1, 'UTF-8');
            if ($char === '<') {
                // HTML begins: flush the pending word first. The test
                // must be against '' and not empty(), because
                // empty('0') is true and the word "0" would otherwise
                // be kept and written after the tag.
                if ($word !== '') {
                    $line_len += mb_strlen($word, 'UTF-8');
                    $wrapped .= $word;
                    $word = '';
                }
                $html = true;
                $wrapped .= $char;
                continue;
            }
            if ($char === '>') {
                // HTML ends.
                $html = false;
                $wrapped .= $char;
                continue;
            }
            if ($html) {
                // Inside a tag: copy as is, not counted in the line.
                $wrapped .= $char;
                continue;
            }
            if ($char === $wrapString) {
                // Existing line break: flush the word, restart the line.
                $wrapped .= $word.$char;
                $word = '';
                $line_len = 0;
                continue;
            }
            if (in_array($char, array(' ', "\t"), true)) {
                // Word delimiter: the delimiter belongs to the word,
                // check if a split is needed before the word.
                $word .= $char;
                $word_len = mb_strlen($word, 'UTF-8');
                if ($word_len + $line_len <= $length) {
                    $line_len += $word_len;
                    $wrapped .= $word;
                } else {
                    // Adding the word would go above the limit, put
                    // it on a new line.
                    $line_len = $word_len;
                    $wrapped .= $wrapString.$word;
                }
                $word = '';
                continue;
            }
            // Normal character, the word keeps growing.
            $word .= $char;
            $word_len = mb_strlen($word, 'UTF-8');
            if ($word_len + $line_len > $length) {
                // The growing word does not fit anymore on the current
                // line: break now, the word itself is kept and will
                // be written at the start of the new line.
                $wrapped .= $wrapString;
                $line_len = 0;
                continue;
            }
            if ($word_len >= $length) {
                // Very long word filling a complete line: cut it.
                $wrapped .= $word.$wrapString;
                $word = '';
                $line_len = 0;
                continue;
            }
        }
        if ($word !== '') {
            $wrapped .= $word;
        }
        return $wrapped;
    }

    /**
     * Given a string, cleaned from the not interesting characters,
     * returns an array with the words as index and the number of
     * times it was in the text as the value.
     *
     * Thai and CJK characters are surrounded with spaces before the
     * split, so that each of them is counted as a word.
     *
     * @credits Tokenizer of DokuWiki to handle Thai and CJK words.
     *          http://www.splitbrain.org/projects/dokuwiki
     *
     * @param string Cleaned, lowercased and utf-8 encoded string.
     * @param bool Remove the accents (True)
     * @return array Word and number of occurences.
     */
    public static function tokenize($string, $remove_accents=True)
    {
        if ($remove_accents) {
            $string = self::removeAccents($string);
        }
        $asian1 = '[\x{0E00}-\x{0E7F}]'; // Thai
        $asian2 = '['.
            '\x{2E80}-\x{3040}'.  // CJK -> Hangul
            '\x{309D}-\x{30A0}'.
            '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
            '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
            '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
            ']';
        $asian3 = '['.            // Hiragana/Katakana (can be two characters)
            '\x{3042}\x{3044}\x{3046}\x{3048}'.
            '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
            '\x{3084}\x{3086}\x{3088}-\x{308D}'.
            '\x{308F}-\x{3094}'.
            '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
            '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
            '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
            '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
            ']['.
            '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
            '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
            '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
            '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
            '\x{31F0}-\x{31FF}'.
            ']?';
        $asian = '(?:'.$asian1.'|'.$asian2.'|'.$asian3.')';
        $words = array();
        // Handle asian chars as single words. The call is best
        // effort: preg_replace returns null on failure and the string
        // is then tokenized without this step.
        $asia = @preg_replace('/('.$asian.')/u', ' \1 ', $string);
        if (!is_null($asia)) {
            $string = $asia;
        }
        $arr = preg_split('/\s+/', $string, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($arr as $w) {
            $w = trim($w);
            if (isset($words[$w])) {
                $words[$w]++;
            } else {
                $words[$w] = 1;
            }
        }
        return $words;
    }

    /**
     * Clean a string from the HTML and the unnecessary
     * punctuation. Convert the string to lowercase.
     *
     * The HTML entities are decoded first, then the tags are
     * stripped. Each line break, tabulation and punctuation character
     * is replaced by exactly one space, the spaces are not collapsed.
     *
     * @info Require mbstring extension.
     *
     * @param string String.
     * @return string Cleaned lowercase string.
     */
    public static function cleanString($string)
    {
        $string = html_entity_decode($string, ENT_QUOTES, 'utf-8');
        $string = str_replace('<?php', '', $string);
        $string = strip_tags($string);
        // strtr() silently ignores the extra characters when its
        // "from" and "to" arguments do not have the same length, so
        // the replacement is generated to always match the list of
        // characters to remove. All of them are single byte ASCII
        // characters, a byte based replacement is safe on utf-8.
        $noise = "\r\n\t".'.<>,;:(){}[]\\|*@!?^_=/\'~`%$#';
        $string = strtr($string, $noise, str_repeat(' ', strlen($noise)));
        return mb_strtolower($string, 'UTF-8');
    }

    /**
     * Remove the accentuated characters.
     *
     * Requires a string in lowercase, the removal is not perfect but
     * is better than nothing. Some characters are replaced by two
     * letters (for example 'ß' by 'ss' and 'ö' by 'oe').
     *
     * @param string Lowercased string in utf-8.
     * @return string String with some of the accents removed.
     */
    public static function removeAccents($string)
    {
        $map = array(
            'à'=>'a', 'ô'=>'o', 'ď'=>'d', 'ḟ'=>'f', 'ë'=>'e',
            'š'=>'s', 'ơ'=>'o', 'ß'=>'ss', 'ă'=>'a', 'ř'=>'r',
            'ț'=>'t', 'ň'=>'n', 'ā'=>'a', 'ķ'=>'k', 'ŝ'=>'s',
            'ỳ'=>'y', 'ņ'=>'n', 'ĺ'=>'l', 'ħ'=>'h', 'ṗ'=>'p',
            'ó'=>'o', 'ú'=>'u', 'ě'=>'e', 'é'=>'e', 'ç'=>'c',
            'ẁ'=>'w', 'ċ'=>'c', 'õ'=>'o', 'ṡ'=>'s', 'ø'=>'o',
            'ģ'=>'g', 'ŧ'=>'t', 'ș'=>'s', 'ė'=>'e', 'ĉ'=>'c',
            'ś'=>'s', 'î'=>'i', 'ű'=>'u', 'ć'=>'c', 'ę'=>'e',
            'ŵ'=>'w', 'ṫ'=>'t', 'ū'=>'u', 'č'=>'c', 'ö'=>'oe',
            'è'=>'e', 'ŷ'=>'y', 'ą'=>'a', 'ł'=>'l', 'ų'=>'u',
            'ů'=>'u', 'ş'=>'s', 'ğ'=>'g', 'ļ'=>'l', 'ƒ'=>'f',
            'ž'=>'z', 'ẃ'=>'w', 'ḃ'=>'b', 'å'=>'a', 'ì'=>'i',
            'ï'=>'i', 'ḋ'=>'d', 'ť'=>'t', 'ŗ'=>'r', 'ä'=>'ae',
            'í'=>'i', 'ŕ'=>'r', 'ê'=>'e', 'ü'=>'ue', 'ò'=>'o',
            'ē'=>'e', 'ñ'=>'n', 'ń'=>'n', 'ĥ'=>'h', 'ĝ'=>'g',
            'đ'=>'d', 'ĵ'=>'j', 'ÿ'=>'y', 'ũ'=>'u', 'ŭ'=>'u',
            'ư'=>'u', 'ţ'=>'t', 'ý'=>'y', 'ő'=>'o', 'â'=>'a',
            'ľ'=>'l', 'ẅ'=>'w', 'ż'=>'z', 'ī'=>'i', 'ã'=>'a',
            'ġ'=>'g', 'ṁ'=>'m', 'ō'=>'o', 'ĩ'=>'i', 'ù'=>'u',
            'į'=>'i', 'ź'=>'z', 'á'=>'a', 'û'=>'u', 'þ'=>'th',
            'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u', 'ĕ'=>'e',
        );
        return strtr($string, $map);
    }

    /**
     * Convert a string to a list of characters.
     *
     * @param string utf-8 encoded string.
     * @return array Characters.
     */
    public static function stringToChars($string)
    {
        $chars = array();
        $strlen = mb_strlen($string, 'UTF-8');
        for ($i=0; $i<$strlen; $i++) {
            $chars[] = mb_substr($string, $i, 1, 'UTF-8');
        }
        return $chars;
    }

    /**
     * Prevent a string to be all uppercase.
     *
     * The string is split on the single spaces. If the split gives
     * more than one piece and if at least 50% of the non empty
     * pieces are equal to their uppercase version, the string is
     * lowercased and then converted using mb_convert_case.
     *
     * A string without any word (empty or only spaces) is returned
     * untouched.
     *
     * @see http://www.php.net/mb_convert_case
     *
     * @param string String to test.
     * @param int Mode to convert the string (MB_CASE_TITLE)
     * @return string Cleaned string.
     */
    public static function preventUpperCase($string, $mode=MB_CASE_TITLE)
    {
        $elts = mb_split(' ', $string);
        $n_elts = count($elts);
        if ($n_elts > 1) {
            $tot = 0;
            foreach ($elts as $elt) {
                if ($elt === '') {
                    // Consecutive, leading or trailing spaces give
                    // empty pieces, they are not words.
                    $n_elts--;
                    continue;
                }
                // Strict comparison, "1e3" == "1E3" is true in PHP.
                if ($elt === mb_strtoupper($elt, 'UTF-8')) {
                    $tot++;
                }
            }
            // $n_elts is 0 when the string contains only spaces, the
            // ratio must not be computed in that case (division by
            // zero).
            if ($n_elts > 0 && (float) $tot / (float) $n_elts >= 0.5) {
                return mb_convert_case(mb_strtolower($string, 'UTF-8'),
                                       $mode, 'UTF-8');
            }
        }
        return $string;
    }

    /**
     * Simple uppercase prevention.
     *
     * Contrary to self::preventUpperCase, this method will also
     * prevent a single word to be uppercase. The conversion is only
     * done when the complete string is equal to its uppercase
     * version.
     *
     * @param string String possibly in uppercase.
     * @param int Mode to convert the string (MB_CASE_TITLE)
     * @return string Mode cased if all uppercase in input.
     */
    public static function simplePreventUpperCase($string, $mode=MB_CASE_TITLE)
    {
        // The same encoding is given to all the mb_* calls, the
        // conversion was always done in UTF-8.
        if ((string) $string === mb_strtoupper($string, 'UTF-8')) {
            return mb_convert_case(mb_strtolower($string, 'UTF-8'),
                                   $mode, 'UTF-8');
        }
        return $string;
    }
}
| 307 | |