验证中...
码云 Gitee IDE 全新上线——支持 Git 管理的轻量在线编码环境
语言: PHP
分类: 其他
最后更新于 2018-08-10 16:19
search.php
原始数据 复制代码
<?php
/**
* Demo search
* User: eagle<eaglewudi@gmail.com>
* Date: 2018/8/10
* Time: 10:41
*/
// Dictionary of known words. splitByWords() only emits a non-numeric token
// when it is a key of $wordsMap, which is derived from this list further down.
$words = [
"杭州", "杭州市", "上海", "上海市", "闵行", "闵行区", "莲花路",
"杭州路", "淮海路", "淮海中路",
"人民广场",
"号", "幢", "座", "楼", "弄", "路", "市", "省",
"东路", "南路", "西路", "北路", "中路", "交叉口", "路口",
"小区", "花园", "苑",
];
// Word lookup map read by splitByWords(); initialised empty here.
$wordsMap = [];
// Largest tokenization window, in characters. Placeholder value; it is
// reassigned from maxStep($words) further down in the script.
$maxStep = 1;
// Synonym groups. Not referenced anywhere else in the visible code.
$synonyms = [
[""]
];
// Words intended to be excluded from tokenization.
$stopwords = [
"此", "此间", "此外", "从", "从而", "打", "待", "但",
"但是", "当", "当着", "到", "得", "的", "的话", "等", "等等", "地"
];
// Stop-word lookup map read by splitByWords(); initialised empty here.
$stopWordsMap = [];
// Tokenization example: 杭州路18号 -> 杭州, 杭州路, 18, 号
// Indexed sentences; a sentence's position in this list is the id stored in $indices.
$sentences = [
];
// Inverted index filled by buildIndex(): token => list of sentence ids.
$indices = [
// word index -> [sentence index]
// '杭州' => [1],
// '杭州路' => [1],
// '号' => [1, 2],
// '18' => [1],
// '19' => [2],
];
/**
 * Tokenize a sentence and register its id under every distinct token.
 *
 * For each window size from 1 to $maxStep the sentence is passed through
 * splitByWords(); $index is then appended to the global $indices entry of
 * each unique token.
 *
 * @param string     $sentence Text to index.
 * @param int|string $index    Identifier appended to each token's id list.
 * @param int        $maxStep  Largest window size (in characters) to use.
 * @return void
 */
function buildIndex($sentence, $index, $maxStep)
{
    global $indices;

    // Collect the tokens of every window size into one flat list.
    $tokens = [];
    $size = 1;
    while ($size <= $maxStep) {
        $tokens = array_merge($tokens, splitByWords($sentence, $size));
        $size++;
    }

    // Each token references the sentence at most once per call.
    foreach (array_unique($tokens) as $token) {
        $indices[$token][] = $index;
    }
}
/**
 * Compute the CRC32 checksum of a string.
 *
 * @param string $str Input string.
 * @return int Value returned by crc32().
 */
function hashIndex($str)
{
    $checksum = crc32($str);

    return $checksum;
}
/**
 * Slide a fixed-width character window over a string and collect tokens.
 *
 * A window is emitted when it is numeric according to is_numeric(), or when
 * it is a key of the global $wordsMap and not a key of the global
 * $stopWordsMap. Only full windows of exactly $step characters are
 * examined; shorter windows at the tail of the string are skipped, since
 * they are not $step characters long and are produced by smaller steps.
 *
 * Note: is_numeric() accepts more than digit runs (signs, decimals and
 * exponent forms also qualify).
 *
 * @param string $str  Text to tokenize (multibyte-aware via mb_* functions).
 * @param int    $step Window width in characters; values below 1 yield [].
 * @return array List of tokens in order of appearance (may contain repeats).
 */
function splitByWords($str, $step)
{
    global $stopWordsMap, $wordsMap;

    $words = [];
    if ($step < 1) {
        return $words;
    }

    // Last offset at which a full $step-character window still fits.
    $last = mb_strlen($str) - $step;
    for ($i = 0; $i <= $last; $i++) {
        $word = mb_substr($str, $i, $step);

        // Numeric windows are always tokens, regardless of the dictionary.
        if (is_numeric($word)) {
            $words[] = $word;
            continue;
        }

        if (!isset($stopWordsMap[$word]) && isset($wordsMap[$word])) {
            $words[] = $word;
        }
    }

    return $words;
}
/**
 * Sort a word list in place, longest word first.
 *
 * Length is measured in characters with mb_strlen(), matching maxStep(),
 * so a multibyte word does not outrank a longer ASCII word because of its
 * byte size.
 *
 * @param array $words Word list, reordered in place and reindexed by usort().
 * @return void
 */
function sortWords(array &$words)
{
    usort($words, function ($a, $b) {
        // Positive when $b is longer, which places $b before $a.
        return mb_strlen($b) - mb_strlen($a);
    });
}
/**
 * Find the character length of the longest word in a list.
 *
 * @param array $words Word list; it is read but not modified.
 * @return int Longest mb_strlen() among the words, never less than 1.
 */
function maxStep(array &$words)
{
    $longest = 1;
    foreach ($words as $word) {
        $chars = mb_strlen($word);
        if ($chars > $longest) {
            $longest = $chars;
        }
    }

    return $longest;
}
/**
 * Score indexed sentences by the number of distinct tokens they share with
 * the query.
 *
 * The query is tokenized with splitByWords() for window sizes 1 through the
 * global $maxStep; every unique token found in the global $indices adds one
 * point to each sentence id listed under it.
 *
 * @param string $sentence Query text.
 * @return array Map of sentence id => match count; empty when nothing matches.
 */
function search($sentence)
{
    global $maxStep, $indices;

    $tokens = [];
    for ($size = 1; $size <= $maxStep; ++$size) {
        $tokens = array_merge($tokens, splitByWords($sentence, $size));
    }

    $scores = [];
    foreach (array_unique($tokens) as $token) {
        if (!isset($indices[$token])) {
            continue;
        }
        foreach ($indices[$token] as $sentenceId) {
            $scores[$sentenceId] = isset($scores[$sentenceId])
                ? $scores[$sentenceId] + 1
                : 1;
        }
    }

    return $scores;
}
// Derive the lookup structures from the dictionaries declared above.
$maxStep = maxStep($words);
$wordsMap = array_flip($words);
// PHP variable names are case-sensitive: splitByWords() reads $stopWordsMap,
// so the map must be stored under exactly that name for stop words to apply.
$stopWordsMap = array_flip($stopwords);

// Index any sentences that were preloaded into $sentences.
foreach ($sentences as $index => $sentence) {
    buildIndex($sentence, $index, $maxStep);
}

echo <<<DOC
指令列表:
add 句子: 增加待索引语句
search 句子: 查询与该语句相似度较高的语句列表(权重高优先)
fc 句子: 测试分词结果
======================================================\n
DOC;

// Interactive prompt: read one command per line until input is exhausted.
while (1) {
    echo "$ ";
    try {
        $line = fgets(STDIN, 1024);
        if ($line === false) {
            // EOF or read error: stop instead of re-prompting forever.
            break;
        }
        $buffer = trim($line);

        // add <sentence>: store the sentence and index it under its list position.
        if (preg_match('/^add \s*([^\n]+)/i', $buffer, $matches)) {
            $sentence = $matches[1];
            $sentences[] = $sentence;
            buildIndex($sentence, count($sentences) - 1, $maxStep);
            echo "[OK]: '$sentence'\n";
            continue;
        }

        // search <sentence>: list matching sentences, highest weight first.
        if (preg_match('/^search \s*([^\n]+)/i', $buffer, $matches)) {
            if ($sids = search($matches[1])) {
                arsort($sids);
                foreach ($sids as $id => $weight) {
                    echo sprintf("[Weight:%d] %s\n", $weight, $sentences[$id]);
                }
            } else {
                echo "Not Found\n";
            }
            continue;
        }

        // fc <sentence>: print the tokenization result. A dedicated variable
        // is used so the global dictionary $words is not overwritten.
        if (preg_match('/^fc \s*([^\n]+)/i', $buffer, $matches)) {
            $tokens = [];
            for ($step = 1; $step <= $maxStep; $step++) {
                foreach (splitByWords($matches[1], $step) as $word) {
                    $tokens[] = $word;
                }
            }
            echo implode("、", array_unique($tokens)) . "\n";
            continue;
        }

        if (!empty($buffer)) {
            echo "Invalid command\n";
        }
    } catch (Exception $e) {
        echo "ERROR: {$e->getMessage()}\n";
    }
}

评论列表( 1 )

827_pushy
D哥 2018-08-10 16:19
→ php search.php                                                                                                              [abbf743f]
指令列表:
    add 句子:     增加待索引语句
    search 句子:  查询与该语句相似度较高的语句列表(权重高优先)
    fc 句子:      测试分词结果
======================================================
$
$
$
$ add 上海市淮海中路131号
[OK]: '上海市淮海中路131号'
$ add 上海市四川中路21号
[OK]: '上海市四川中路21号'
$ add 杭州路18号
[OK]: '杭州路18号'
$ add 杭州路19号
[OK]: '杭州路19号'
$
$
$ search 上海市淮海中路145号
[Weight:8] 上海市淮海中路131号
[Weight:7] 上海市四川中路21号
[Weight:3] 杭州路18号
[Weight:3] 杭州路19号
$
$
$ search 杭州路18楼
[Weight:6] 杭州路18号
[Weight:4] 杭州路19号
[Weight:2] 上海市淮海中路131号
[Weight:2] 上海市四川中路21号
$
$ fc 从杭州路18号与淮海中路路口向南罗马小区
路、1、8、号、杭州、18、中路、路口、小区、杭州路、淮海中路
$

你可以在登录后,发表评论

搜索帮助