Pluf Framework

Pluf Framework Git Source Tree

Root/src/Pluf/Text.php

1<?php
2/* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3/*
4# ***** BEGIN LICENSE BLOCK *****
5# This file is part of Plume Framework, a simple PHP Application Framework.
6# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.
7#
8# Plume Framework is free software; you can redistribute it and/or modify
9# it under the terms of the GNU Lesser General Public License as published by
10# the Free Software Foundation; either version 2.1 of the License, or
11# (at your option) any later version.
12#
13# Plume Framework is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU Lesser General Public License for more details.
17#
18# You should have received a copy of the GNU Lesser General Public License
19# along with this program; if not, write to the Free Software
20# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21#
22# ***** END LICENSE BLOCK ***** */
23
/**
 * Utility class to clean/manipulate strings.
 *
 * All methods are static and work on UTF-8 encoded strings; the
 * mbstring extension is required.
 */
class Pluf_Text
{
    /**
     * Wrap a string containing HTML code.
     *
     * The HTML is not broken, words are broken only if very long.
     * Everything between a '<' and the next '>' is copied verbatim
     * and does not count towards the length of the current line.
     *
     * The wrap string is only recognised as an already existing line
     * break in the input when it is a single character (the input is
     * scanned one character at a time).
     *
     * Improved from a version available on php.net
     *
     * @see http://www.php.net/manual/en/function.wordwrap.php#89782
     *
     * @param string The string to wrap (utf-8)
     * @param int The maximal length of a line (45)
     * @param string Wrap string ("\n")
     * @return string Wrapped string
     */
    public static function wrapHtml($string, $length=45, $wrapString="\n")
    {
        $wrapped = '';   // Output built so far.
        $word = '';      // Word being collected, not yet in the output.
        $html = false;   // True while inside a <...> tag.
        $line_len = 0;   // Visible characters already on the current line.
        foreach (self::stringToChars($string) as $char) {
            if ($char === '<') {
                // A tag begins: flush the pending word first. The
                // comparison is against '' and not empty() because
                // empty('0') is true and the word "0" would otherwise
                // be emitted after the tag instead of before it.
                if ($word !== '') {
                    $line_len += mb_strlen($word, 'UTF-8');
                    $wrapped .= $word;
                    $word = '';
                }
                $html = true;
                $wrapped .= $char;
                continue;
            }
            if ($char === '>') {
                // The tag ends.
                $html = false;
                $wrapped .= $char;
                continue;
            }
            if ($html) {
                // Inside a tag: copy as is, never wrap.
                $wrapped .= $char;
                continue;
            }
            if ($char === $wrapString) {
                // The input already breaks the line here.
                $wrapped .= $word.$char;
                $word = '';
                $line_len = 0;
                continue;
            }
            if ($char === ' ' || $char === "\t") {
                // Word delimiter: the delimiter stays attached to the
                // word, then check if a break is needed before it.
                $word .= $char;
                $word_len = mb_strlen($word, 'UTF-8');
                if ($word_len + $line_len <= $length) {
                    $line_len += $word_len;
                    $wrapped .= $word;
                } else {
                    // Adding the word would go above the limit, it
                    // starts a new line.
                    $line_len = $word_len;
                    $wrapped .= $wrapString.$word;
                }
                $word = '';
                continue;
            }
            // Regular character of a word.
            $word .= $char;
            $word_len = mb_strlen($word, 'UTF-8');
            if ($word_len + $line_len > $length) {
                // The word does not fit on the current line anymore:
                // break now, the word continues on the new line.
                $wrapped .= $wrapString;
                $line_len = 0;
                continue;
            }
            if ($word_len >= $length) {
                // The word alone fills a complete line: force a break.
                $wrapped .= $word.$wrapString;
                $word = '';
                $line_len = 0;
            }
        }
        if ($word !== '') {
            $wrapped .= $word;
        }
        return $wrapped;
    }

    /**
     * Given a string, cleaned from the not interesting characters,
     * returns an array with the words as index and the number of
     * times it was in the text as the value.
     *
     * Thai and CJK characters are isolated so that each of them is
     * counted as a word of its own. Note that PHP stores words made
     * only of digits as integer keys.
     *
     * @credits Tokenizer of DokuWiki to handle Thai and CJK words.
     *          http://www.splitbrain.org/projects/dokuwiki
     *
     * @param string Cleaned, lowercased and utf-8 encoded string.
     * @param bool Remove the accents (true)
     * @return array Word and number of occurrences.
     */
    public static function tokenize($string, $remove_accents=true)
    {
        if ($remove_accents) {
            $string = self::removeAccents($string);
        }
        $asian1 = '[\x{0E00}-\x{0E7F}]'; // Thai
        $asian2 = '['.
            '\x{2E80}-\x{3040}'.  // CJK -> Hangul
            '\x{309D}-\x{30A0}'.
            '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
            '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
            '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
            ']';
        $asian3 = '['.            // Hiragana/Katakana (can be two characters)
            '\x{3042}\x{3044}\x{3046}\x{3048}'.
            '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
            '\x{3084}\x{3086}\x{3088}-\x{308D}'.
            '\x{308F}-\x{3094}'.
            '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
            '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
            '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
            '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
            ']['.
            '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
            '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
            '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
            '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
            '\x{31F0}-\x{31FF}'.
            ']?';
        $asian = '(?:'.$asian1.'|'.$asian2.'|'.$asian3.')';
        $words = array();
        // Handle asian chars as single words by surrounding them with
        // spaces. This is best effort: on a regexp failure (null is
        // returned) the string is tokenized unchanged.
        $asia = @preg_replace('/('.$asian.')/u', ' \1 ', $string);
        if (!is_null($asia)) {
            $string = $asia;
        }
        $arr = preg_split('/\s+/', $string, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($arr as $w) {
            // trim() also strips characters such as NUL which the
            // split may leave in place; a token made only of them
            // must not be counted as the empty word.
            $w = trim($w);
            if ($w === '') {
                continue;
            }
            if (isset($words[$w])) {
                $words[$w]++;
            } else {
                $words[$w] = 1;
            }
        }
        return $words;
    }

    /**
     * Clean a string from the HTML and the unnecessary
     * punctuation. Convert the string to lowercase.
     *
     * The HTML entities are decoded, the tags are stripped, then the
     * line breaks, tabs and punctuation characters are each replaced
     * by one space.
     *
     * @info Require mbstring extension.
     *
     * @param string String.
     * @return string Cleaned lowercase string.
     */
    public static function cleanString($string)
    {
        $string = html_entity_decode($string, ENT_QUOTES, 'utf-8');
        $string = str_replace('<?php', '', $string);
        $string = strip_tags($string);
        // Every character listed here becomes a space. With string
        // arguments, strtr() ignores the extra characters of the
        // longer list, so the replacement must have exactly the same
        // length as the list of characters to replace.
        $separators = "\r\n\t".'.<>,;:(){}[]\\|*@!?^_=/\'~`%$#';
        $string = strtr($string, $separators,
                        str_repeat(' ', strlen($separators)));
        return mb_strtolower($string, 'UTF-8');
    }

    /**
     * Remove the accentuated characters.
     *
     * Requires a string in lowercase, the removal is not perfect but
     * is better than nothing. Some characters are expanded to two
     * letters (for example 'ö' gives 'oe' and 'ß' gives 'ss').
     *
     * @param string Lowercased string in utf-8.
     * @return string String with some of the accents removed.
     */
    public static function removeAccents($string)
    {
        $map = array(
            'à'=>'a', 'ô'=>'o', 'ď'=>'d', 'ḟ'=>'f', 'ë'=>'e',
            'š'=>'s', 'ơ'=>'o', 'ß'=>'ss', 'ă'=>'a', 'ř'=>'r',
            'ț'=>'t', 'ň'=>'n', 'ā'=>'a', 'ķ'=>'k', 'ŝ'=>'s',
            'ỳ'=>'y', 'ņ'=>'n', 'ĺ'=>'l', 'ħ'=>'h', 'ṗ'=>'p',
            'ó'=>'o', 'ú'=>'u', 'ě'=>'e', 'é'=>'e', 'ç'=>'c',
            'ẁ'=>'w', 'ċ'=>'c', 'õ'=>'o', 'ṡ'=>'s', 'ø'=>'o',
            'ģ'=>'g', 'ŧ'=>'t', 'ș'=>'s', 'ė'=>'e', 'ĉ'=>'c',
            'ś'=>'s', 'î'=>'i', 'ű'=>'u', 'ć'=>'c', 'ę'=>'e',
            'ŵ'=>'w', 'ṫ'=>'t', 'ū'=>'u', 'č'=>'c', 'ö'=>'oe',
            'è'=>'e', 'ŷ'=>'y', 'ą'=>'a', 'ł'=>'l', 'ų'=>'u',
            'ů'=>'u', 'ş'=>'s', 'ğ'=>'g', 'ļ'=>'l', 'ƒ'=>'f',
            'ž'=>'z', 'ẃ'=>'w', 'ḃ'=>'b', 'å'=>'a', 'ì'=>'i',
            'ï'=>'i', 'ḋ'=>'d', 'ť'=>'t', 'ŗ'=>'r', 'ä'=>'ae',
            'í'=>'i', 'ŕ'=>'r', 'ê'=>'e', 'ü'=>'ue', 'ò'=>'o',
            'ē'=>'e', 'ñ'=>'n', 'ń'=>'n', 'ĥ'=>'h', 'ĝ'=>'g',
            'đ'=>'d', 'ĵ'=>'j', 'ÿ'=>'y', 'ũ'=>'u', 'ŭ'=>'u',
            'ư'=>'u', 'ţ'=>'t', 'ý'=>'y', 'ő'=>'o', 'â'=>'a',
            'ľ'=>'l', 'ẅ'=>'w', 'ż'=>'z', 'ī'=>'i', 'ã'=>'a',
            'ġ'=>'g', 'ṁ'=>'m', 'ō'=>'o', 'ĩ'=>'i', 'ù'=>'u',
            'į'=>'i', 'ź'=>'z', 'á'=>'a', 'û'=>'u', 'þ'=>'th',
            'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u', 'ĕ'=>'e',
        );
        return strtr($string, $map);
    }

    /**
     * Convert a string to a list of characters.
     *
     * @param string utf-8 encoded string.
     * @return array Characters.
     */
    public static function stringToChars($string)
    {
        // One linear pass: with the "u" modifier the empty pattern
        // matches between every two characters of the string.
        $chars = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
        if (is_array($chars)) {
            return $chars;
        }
        // The split fails on a string which is not valid utf-8, in
        // this case fall back to the slower extraction by position.
        $chars = array();
        $strlen = mb_strlen($string, 'UTF-8');
        for ($i=0; $i<$strlen; $i++) {
            $chars[] = mb_substr($string, $i, 1, 'UTF-8');
        }
        return $chars;
    }

    /**
     * Prevent a string to be all uppercase.
     *
     * If at least 50% of the words in the string are uppercase and
     * if the string contains more than one word, the string is
     * lowercased then converted using mb_convert_case. The words are
     * delimited by spaces, empty pieces coming from repeated, leading
     * or trailing spaces are not counted as words.
     *
     * @see http://www.php.net/mb_convert_case
     *
     * @param string String to test.
     * @param int Mode to convert the string (MB_CASE_TITLE)
     * @return string Cleaned string.
     */
    public static function preventUpperCase($string, $mode=MB_CASE_TITLE)
    {
        // Keep the real words only. They are counted before any test
        // is done, this way a string made only of spaces cannot lead
        // to a division by zero and a single word followed by a
        // space is still considered as a single word.
        $words = array();
        foreach (explode(' ', $string) as $elt) {
            if ($elt !== '') {
                $words[] = $elt;
            }
        }
        $n_words = count($words);
        if ($n_words < 2) {
            return $string;
        }
        $tot = 0;
        foreach ($words as $word) {
            // Strict comparison, or numeric looking words such as
            // "1e3" would be compared as numbers.
            if ($word === mb_strtoupper($word, 'UTF-8')) {
                $tot++;
            }
        }
        if ((float) $tot / (float) $n_words >= 0.5) {
            return mb_convert_case(mb_strtolower($string, 'UTF-8'),
                                   $mode, 'UTF-8');
        }
        return $string;
    }

    /**
     * Simple uppercase prevention.
     *
     * Contrary to self::preventUpperCase, this method will also
     * prevent a single word to be uppercase.
     *
     * @param string String possibly in uppercase (utf-8).
     * @param int Mode to convert the string (MB_CASE_TITLE)
     * @return string Mode cased if all uppercase in input.
     */
    public static function simplePreventUpperCase($string, $mode=MB_CASE_TITLE)
    {
        // Same explicit encoding for every mbstring call and a strict
        // string comparison (no numeric juggling).
        if ((string) $string === mb_strtoupper($string, 'UTF-8')) {
            return mb_convert_case(mb_strtolower($string, 'UTF-8'),
                                   $mode, 'UTF-8');
        }
        return $string;
    }
}
307

Archive Download this file

Branches

Tags