| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243 |
- <?php
- // +----------------------------------------------------------------------
- // | Fanwe 方维直播系统
- // +----------------------------------------------------------------------
- // | Copyright (c) 2011 http://www.fanwe.com All rights reserved.
- // +----------------------------------------------------------------------
- // | Author: 云淡风轻(1956838968@qq.com)
- // +----------------------------------------------------------------------
- class words
- {
- /**
- * 文本分词
- * @param string $text 需要分词的文本
- * @param int $num 返回分词数量
- * @return array
- */
- public static function segment($text,$num = 10)
- {
- $list = array();
- if(empty($text))
- return $list;
-
- //检测是否已安装php_scws扩展
- if(function_exists("scws_open"))
- {
- $sh = scws_open();
- scws_set_charset($sh,'utf8');
- scws_set_dict($sh,APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
- scws_set_rule($sh,APP_ROOT_PATH.'system/scws/rules.utf8.ini');
- scws_set_ignore($sh,true);
- scws_send_text($sh, $text);
- $words = scws_get_tops($sh, $num);
- scws_close($sh);
- }
- else
- {
- require_once APP_ROOT_PATH.'system/scws/pscws4.class.php';
- $pscws = new PSCWS4();
- $pscws->set_dict(APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
- $pscws->set_rule(APP_ROOT_PATH.'system/scws/rules.utf8.ini');
- $pscws->set_ignore(true);
- $pscws->send_text($text);
- $words = $pscws->get_tops($num);
- $pscws->close();
- }
-
- foreach($words as $word)
- {
- $list[] = $word['word'];
- }
-
- return $list;
- }
-
- public static function segments($arr,$num = 10)
- {
- $list = array();
- if(empty($text))
- return $list;
-
- $words = array();
-
- //检测是否已安装php_scws扩展
- if(function_exists("scws_open"))
- {
- $sh = scws_open();
- scws_set_charset($sh,'utf8');
- scws_set_dict($sh,APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
- scws_set_rule($sh,APP_ROOT_PATH.'system/scws/rules.utf8.ini');
- scws_set_ignore($sh,true);
- foreach($arr as $key => $text)
- {
- scws_send_text($sh, $text);
- $words[] = scws_get_tops($sh, $num);
- }
- scws_close($sh);
- }
- else
- {
- require_once APP_ROOT_PATH.'system/scws/pscws4.class.php';
- $pscws = new PSCWS4();
- $pscws->set_dict(APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
- $pscws->set_rule(APP_ROOT_PATH.'system/scws/rules.utf8.ini');
- $pscws->set_ignore(true);
- foreach($arr as $key => $text)
- {
- $pscws->send_text($text);
- $words[] = $pscws->get_tops($num);
- }
- $pscws->close();
- }
-
- for($i = 0;$i < $num; $i++)
- {
- foreach($words as $item)
- {
- if(isset($item[$i]))
- {
- $word = $item[$i]['word'];
- if(isset($list[$word]))
- $list[$word]++;
- else
- $list[$word] = 1;
- }
- }
- }
-
- $list = array_slice($list,0,$num);
- return array_keys($list);
- }
-
-
- /**
- * 文本分词
- * @param string $text 需要分词的文本
- * @return array
- */
- public static function segmentAll($text) {
- $list = array ();
- if(empty($text)){
- return $list;
- }
- //检测是否已安装php_scws扩展
- if (function_exists("scws_open")){
- $sh = scws_open();
- scws_set_charset($sh, 'utf8');
- scws_set_dict($sh, APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
- scws_set_rule($sh, APP_ROOT_PATH.'system/rules.utf8.ini');
- scws_set_ignore($sh, true);
- scws_send_text($sh, $text);
- while ($words = scws_get_result($sh)){
- foreach ($words as $word){
- $list[] = $word['word'];
- }
- }
- scws_close($sh);
- }else{
- require_once APP_ROOT_PATH.'system/scws/pscws4.class.php';
- $pscws = new PSCWS4();
- $pscws->set_dict(APP_ROOT_PATH.'system/scws/dict.utf8.xdb');
- $pscws->set_rule(APP_ROOT_PATH.'system/scws/rules.utf8.ini');
- $pscws->set_ignore(true);
- $pscws->send_text($text);
- while ($words = $pscws->get_result()){
- foreach ($words as $word){
- $list[] = $word['word'];
- }
- }
- $pscws->close();
- }
- return $list;
- }
-
- /**
- * utf8字符串分隔为unicode字符串
- * @param string $str 要转换的字符串
- * @param string $pre
- * @return string
- */
- public static function segmentToUnicode($str, $pre = ''){
- $str = strtolower($str);
- $arr = array ();
- $temps = self::segmentAll($str);
- foreach ($temps as $word) {
- $temp = $pre;
- $str_len = mb_strlen($word, 'UTF-8');
- for ($i = 0; $i < $str_len; $i++){
- $s = mb_substr($word, $i, 1, 'UTF-8');
- if ($s != ' ' && $s != ' '){
- $temp .= 'ux'.self::utf8ToUnicode($s);
- }
- }
- $arr[] = $temp;
- }
- $str = self::clearSymbol($str);
- $str_len = mb_strlen($str, 'UTF-8');
- for ($i = 0; $i < $str_len; $i++){
- $s = mb_substr($str, $i, 1, 'UTF-8');
- if ($s != ' ' && $s != ' '){
- $arr[] = $pre.'ux'.self::utf8ToUnicode($s);
- }
- }
- $arr = array_unique($arr);
- return implode(' ', $arr);
- }
-
- public static function strToUnicode($str, $depart = ''){
- $str = self::clearSymbol(strtolower($str));
- $arr = array();
- $str_len = mb_strlen($str,'utf-8');
- for($i = 0;$i < $str_len;$i++){
- $s = mb_substr($str,$i,1,'utf-8');
- if($s != ' ' && $s != ' '){
- $arr[] = $depart.'ux'.self::utf8ToUnicode($s);
- }
- }
- return implode(' ',$arr);
- }
-
- /**
- * 清除符号
- * @param string $str 要清除符号的字符串
- * @return string
- */
- public static function clearSymbol($str){
- static $symbols = null;
- if($symbols === null){
- $symbols = file_get_contents(APP_ROOT_PATH.'system/table/symbol.table');
- $symbols = explode("\r\n",$symbols);
- }
- return str_replace($symbols,'',$str);
- }
-
- /**
- * utf8字符转Unicode字符
- * @param string $char 要转换的单字符
- * @return void
- */
- public static function utf8ToUnicode($char){
- switch(strlen($char)){
- case 1:
- return ord($char);
- case 2:
- $n = (ord($char[0]) & 0x3f) << 6;
- $n += ord($char[1]) & 0x3f;
- return $n;
- case 3:
- $n = (ord($char[0]) & 0x1f) << 12;
- $n += (ord($char[1]) & 0x3f) << 6;
- $n += ord($char[2]) & 0x3f;
- return $n;
- case 4:
- $n = (ord($char[0]) & 0x0f) << 18;
- $n += (ord($char[1]) & 0x3f) << 12;
- $n += (ord($char[2]) & 0x3f) << 6;
- $n += ord($char[3]) & 0x3f;
- return $n;
- }
- }
- }
- ?>
|