words.php 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. <?php
  2. // +----------------------------------------------------------------------
  3. // | Fanwe 方维直播系统
  4. // +----------------------------------------------------------------------
  5. // | Copyright (c) 2011 http://www.fanwe.com All rights reserved.
  6. // +----------------------------------------------------------------------
  7. // | Author: 云淡风轻(1956838968@qq.com)
  8. // +----------------------------------------------------------------------
  9. class words
  10. {
  11. /**
  12. * 文本分词
  13. * @param string $text 需要分词的文本
  14. * @param int $num 返回分词数量
  15. * @return array
  16. */
  17. public static function segment($text,$num = 10)
  18. {
  19. $list = array();
  20. if(empty($text))
  21. return $list;
  22. //检测是否已安装php_scws扩展
  23. if(function_exists("scws_open"))
  24. {
  25. $sh = scws_open();
  26. scws_set_charset($sh,'utf8');
  27. scws_set_dict($sh,APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
  28. scws_set_rule($sh,APP_ROOT_PATH.'system/scws/rules.utf8.ini');
  29. scws_set_ignore($sh,true);
  30. scws_send_text($sh, $text);
  31. $words = scws_get_tops($sh, $num);
  32. scws_close($sh);
  33. }
  34. else
  35. {
  36. require_once APP_ROOT_PATH.'system/scws/pscws4.class.php';
  37. $pscws = new PSCWS4();
  38. $pscws->set_dict(APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
  39. $pscws->set_rule(APP_ROOT_PATH.'system/scws/rules.utf8.ini');
  40. $pscws->set_ignore(true);
  41. $pscws->send_text($text);
  42. $words = $pscws->get_tops($num);
  43. $pscws->close();
  44. }
  45. foreach($words as $word)
  46. {
  47. $list[] = $word['word'];
  48. }
  49. return $list;
  50. }
  51. public static function segments($arr,$num = 10)
  52. {
  53. $list = array();
  54. if(empty($text))
  55. return $list;
  56. $words = array();
  57. //检测是否已安装php_scws扩展
  58. if(function_exists("scws_open"))
  59. {
  60. $sh = scws_open();
  61. scws_set_charset($sh,'utf8');
  62. scws_set_dict($sh,APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
  63. scws_set_rule($sh,APP_ROOT_PATH.'system/scws/rules.utf8.ini');
  64. scws_set_ignore($sh,true);
  65. foreach($arr as $key => $text)
  66. {
  67. scws_send_text($sh, $text);
  68. $words[] = scws_get_tops($sh, $num);
  69. }
  70. scws_close($sh);
  71. }
  72. else
  73. {
  74. require_once APP_ROOT_PATH.'system/scws/pscws4.class.php';
  75. $pscws = new PSCWS4();
  76. $pscws->set_dict(APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
  77. $pscws->set_rule(APP_ROOT_PATH.'system/scws/rules.utf8.ini');
  78. $pscws->set_ignore(true);
  79. foreach($arr as $key => $text)
  80. {
  81. $pscws->send_text($text);
  82. $words[] = $pscws->get_tops($num);
  83. }
  84. $pscws->close();
  85. }
  86. for($i = 0;$i < $num; $i++)
  87. {
  88. foreach($words as $item)
  89. {
  90. if(isset($item[$i]))
  91. {
  92. $word = $item[$i]['word'];
  93. if(isset($list[$word]))
  94. $list[$word]++;
  95. else
  96. $list[$word] = 1;
  97. }
  98. }
  99. }
  100. $list = array_slice($list,0,$num);
  101. return array_keys($list);
  102. }
  103. /**
  104. * 文本分词
  105. * @param string $text 需要分词的文本
  106. * @return array
  107. */
  108. public static function segmentAll($text) {
  109. $list = array ();
  110. if(empty($text)){
  111. return $list;
  112. }
  113. //检测是否已安装php_scws扩展
  114. if (function_exists("scws_open")){
  115. $sh = scws_open();
  116. scws_set_charset($sh, 'utf8');
  117. scws_set_dict($sh, APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
  118. scws_set_rule($sh, APP_ROOT_PATH.'system/rules.utf8.ini');
  119. scws_set_ignore($sh, true);
  120. scws_send_text($sh, $text);
  121. while ($words = scws_get_result($sh)){
  122. foreach ($words as $word){
  123. $list[] = $word['word'];
  124. }
  125. }
  126. scws_close($sh);
  127. }else{
  128. require_once APP_ROOT_PATH.'system/scws/pscws4.class.php';
  129. $pscws = new PSCWS4();
  130. $pscws->set_dict(APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
  131. $pscws->set_rule(APP_ROOT_PATH.'system/scws/rules.utf8.ini');
  132. $pscws->set_ignore(true);
  133. $pscws->send_text($text);
  134. while ($words = $pscws->get_result()){
  135. foreach ($words as $word){
  136. $list[] = $word['word'];
  137. }
  138. }
  139. $pscws->close();
  140. }
  141. return $list;
  142. }
  143. /**
  144. * utf8字符串分隔为unicode字符串
  145. * @param string $str 要转换的字符串
  146. * @param string $pre
  147. * @return string
  148. */
  149. public static function segmentToUnicode($str, $pre = ''){
  150. $str = strtolower($str);
  151. $arr = array ();
  152. $temps = self::segmentAll($str);
  153. foreach ($temps as $word) {
  154. $temp = $pre;
  155. $str_len = mb_strlen($word, 'UTF-8');
  156. for ($i = 0; $i < $str_len; $i++){
  157. $s = mb_substr($word, $i, 1, 'UTF-8');
  158. if ($s != ' ' && $s != ' '){
  159. $temp .= 'ux'.self::utf8ToUnicode($s);
  160. }
  161. }
  162. $arr[] = $temp;
  163. }
  164. $str = self::clearSymbol($str);
  165. $str_len = mb_strlen($str, 'UTF-8');
  166. for ($i = 0; $i < $str_len; $i++){
  167. $s = mb_substr($str, $i, 1, 'UTF-8');
  168. if ($s != ' ' && $s != ' '){
  169. $arr[] = $pre.'ux'.self::utf8ToUnicode($s);
  170. }
  171. }
  172. $arr = array_unique($arr);
  173. return implode(' ', $arr);
  174. }
  175. public static function strToUnicode($str, $depart = ''){
  176. $str = self::clearSymbol(strtolower($str));
  177. $arr = array();
  178. $str_len = mb_strlen($str,'utf-8');
  179. for($i = 0;$i < $str_len;$i++){
  180. $s = mb_substr($str,$i,1,'utf-8');
  181. if($s != ' ' && $s != ' '){
  182. $arr[] = $depart.'ux'.self::utf8ToUnicode($s);
  183. }
  184. }
  185. return implode(' ',$arr);
  186. }
  187. /**
  188. * 清除符号
  189. * @param string $str 要清除符号的字符串
  190. * @return string
  191. */
  192. public static function clearSymbol($str){
  193. static $symbols = null;
  194. if($symbols === null){
  195. $symbols = file_get_contents(APP_ROOT_PATH.'system/table/symbol.table');
  196. $symbols = explode("\r\n",$symbols);
  197. }
  198. return str_replace($symbols,'',$str);
  199. }
  200. /**
  201. * utf8字符转Unicode字符
  202. * @param string $char 要转换的单字符
  203. * @return void
  204. */
  205. public static function utf8ToUnicode($char){
  206. switch(strlen($char)){
  207. case 1:
  208. return ord($char);
  209. case 2:
  210. $n = (ord($char[0]) & 0x3f) << 6;
  211. $n += ord($char[1]) & 0x3f;
  212. return $n;
  213. case 3:
  214. $n = (ord($char[0]) & 0x1f) << 12;
  215. $n += (ord($char[1]) & 0x3f) << 6;
  216. $n += ord($char[2]) & 0x3f;
  217. return $n;
  218. case 4:
  219. $n = (ord($char[0]) & 0x0f) << 18;
  220. $n += (ord($char[1]) & 0x3f) << 12;
  221. $n += (ord($char[2]) & 0x3f) << 6;
  222. $n += ord($char[3]) & 0x3f;
  223. return $n;
  224. }
  225. }
  226. }
  227. ?>