百度dict 采集样本
我写了一个采集百度词典(dict)翻译结果全部数据的程序,并附带了 13.5 万条的单词库和一个简单的采集案例。这里把其中主要的类 dict.class.php 放出来,项目地址:http://github.com/widuu/baidu_dict,有需要的直接 fork 就可以了~么么哒。这东西用的人很少,所以用得上的兄弟直接拿走吧~
<?php
/**
 * dict.class.php - scrapes the Baidu Dict result page for a word.
 *
 * @copyright (C) 2014 widuu
 * @license http://www.widuu.com
 * @lastmodify 2014-2-15
 */

// Declares the response as UTF-8 HTML; this runs as soon as the file is included.
header("content-type:text/html;charset=utf8");

/**
 * Downloads http://dict.baidu.com/s?wd=<word> and extracts the individual
 * sections of the result page with regular expressions.
 */
class Dict
{
    /** @var string Word currently being looked up. */
    private $word;

    /** @var string|null Result page downloaded for $word; null until it has been fetched. */
    private $page = null;

    /** @var int Maximum number of "</dd>"-delimited phrase entries read by getPhrase(). */
    private static $num = 10;

    public function __construct()
    {
    }

    /**
     * Looks up a word and returns everything scraped from its result page.
     *
     * The page is downloaded once per call and shared by all extractors.
     *
     * @param string $word English word to look up (URL-encoded internally).
     * @return array array(
     *     "symbol"  => phonetic symbols, array('en' => ..., 'us' => ...),
     *     "pro"     => pronunciation URLs, array('en' => ..., 'us' => ...),
     *     "example" => example sentences, list of chunks of up to two strings,
     *     "explain" => concise definitions, array("x" => ..., "b" => ...),
     *     "synonym" => synonyms / antonyms (nested array),
     *     "phrase"  => phrases, keyed by phrase text
     * )
     */
    public function content($word)
    {
        $this->word = $word;
        $this->page = null; // drop the page cached for a previous word

        return array(
            "symbol"  => $this->Pronounced(),
            "pro"     => $this->getSay(),
            "example" => $this->getExample(),
            "explain" => $this->getExplain(),
            "synonym" => $this->getSynonym(),
            "phrase"  => $this->getPhrase(),
        );
    }

    /**
     * Fetches the result page for the current word with cURL.
     *
     * The body is cached in $this->page so the extractors share one request.
     * A transfer error is echoed (the reporting style this class already used)
     * and an empty string is returned, which makes every extractor yield its
     * "nothing found" value.
     *
     * @return string Page body, or '' when the transfer failed.
     */
    private function getContent()
    {
        if ($this->page !== null) {
            return $this->page;
        }

        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        // Encode the word so spaces and non-ASCII characters cannot break the query string.
        $url = "http://dict.baidu.com/s?wd=" . urlencode($this->word);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $result = curl_exec($ch);
        // The error must be read from the handle that performed the transfer, before closing it.
        if (curl_errno($ch)) {
            echo 'Errno' . curl_error($ch);
        }
        curl_close($ch);

        $this->page = ($result === false) ? '' : $result;

        return $this->page;
    }

    /**
     * Builds the array('en' => ..., 'us' => ...) pair from the first two
     * captures of a preg_match_all() result; a missing capture becomes null.
     *
     * @param array $matches Match array filled by preg_match_all().
     * @return array
     */
    private function firstTwoCaptures(array $matches)
    {
        return array(
            'en' => isset($matches[1][0]) ? $matches[1][0] : null,
            'us' => isset($matches[1][1]) ? $matches[1][1] : null,
        );
    }

    /**
     * Extracts the phonetic symbols: the text between `"EN-US">` and `</b>`.
     *
     * @return array array('en' => first match, 'us' => second match); null when absent.
     */
    private function Pronounced()
    {
        preg_match_all('/"EN\-US"\>(.*)\<\/b\>/Ui', $this->getContent(), $pronounced);

        return $this->firstTwoCaptures($pronounced);
    }

    /**
     * Extracts the pronunciation URLs: the values of the first two `url="..."` attributes.
     *
     * @return array array('en' => first match, 'us' => second match); null when absent.
     */
    private function getSay()
    {
        preg_match_all('/url="(.*)"/Ui', $this->getContent(), $pronounced);

        return $this->firstTwoCaptures($pronounced);
    }

    /**
     * Extracts the example sentences from the page's `var example_data = ...];` script literal.
     *
     * The literal is split on "[[[" and then "[[". Within each piece the first
     * string of every `["...",` pair is collected and the strings are joined
     * with spaces into one sentence. Sentences are returned in chunks of two.
     *
     * @return array List of chunks of up to two sentences; empty when the page has no example data.
     */
    private function getExample()
    {
        preg_match_all('/var example_data = (.*)\]\;/Us', $this->getContent(), $example);
        if (!isset($example[1][0])) {
            return array();
        }

        // Normalise the opening brackets so the literal always starts with exactly "[[[".
        $literal = "[[[" . ltrim($example[1][0], "[");

        $sentences = array();
        foreach (explode("[[[", $literal) as $group) {
            foreach (explode("[[", "[[" . $group) as $piece) {
                preg_match_all('/\["(.*)",/Us', "[" . $piece, $match);
                if (!empty($match[1])) {
                    // Collected in an array (not an "@"-joined string) so that
                    // sentences containing "@" are not split apart.
                    $sentences[] = implode(" ", $match[1]);
                }
            }
        }

        return array_chunk($sentences, 2);
    }

    /**
     * Extracts the concise definitions from the `id="en-simple-means"` section.
     *
     * @return array array(
     *     "x" => map of <strong> text => <span> text for each <p><strong>..</strong><span>..</span></p>,
     *     "b" => map of label => linked word for each <span>label:<a href="..">word</a></span>
     * )
     */
    private function getExplain()
    {
        preg_match_all(
            '/id\="en\-simple\-means"\>(.*)\<div(\s+)class\="source"\>/Us',
            $this->getContent(),
            $explain
        );
        // A missing section is treated as empty markup so both maps come back empty.
        $section = isset($explain[1][0]) ? $explain[1][0] : '';

        preg_match_all(
            '/\<p\>\<strong\>(?P<adj>.*)\<\/strong\>\<span\>(?P<name>.*)\<\/span\>\<\/p\>/Us',
            $section,
            $meanings
        );
        preg_match_all(
            '/\<span\>(?P<tag>[^\>]+)\:\<a(\s+)href\="(.*)"\>(?P<word>.*)\<\/a\>\<\/span\>/Us',
            $section,
            $forms
        );

        $byPartOfSpeech = array();
        foreach ($meanings["adj"] as $i => $partOfSpeech) {
            $byPartOfSpeech[$partOfSpeech] = $meanings["name"][$i];
        }

        $byTag = array();
        foreach ($forms["tag"] as $i => $tag) {
            $byTag[$tag] = strip_tags($forms["word"][$i]);
        }

        return array("x" => $byPartOfSpeech, "b" => $byTag);
    }

    /**
     * Extracts synonyms / antonyms from the `id="en-syn-ant"` section.
     *
     * The section is split on "</dl>". The result is indexed by the position of
     * the "</dl>" fragment, then by the <strong> heading, then by the <p> title
     * of each <li>. Each value is the tag-stripped remainder of that <li>.
     *
     * @return array Nested array as described; empty when the section is missing.
     */
    private function getSynonym()
    {
        preg_match_all(
            '/id="en\-syn\-ant"\>(.*)<div(\s+)class\="source">/Us',
            $this->getContent(),
            $synonym
        );
        if (!isset($synonym[1][0])) {
            return array();
        }

        $result = array();
        foreach (explode("</dl>", $synonym[1][0]) as $groupIndex => $group) {
            // NOTE(review): the "\ \;" after the heading matches a literal space
            // followed by ";". It may originally have been "&nbsp;" and been
            // altered when the code was published - confirm against the live markup.
            preg_match_all(
                '/\<strong\>(?P<adj>.*)\ \;\<\/strong\>\<\/div\>\<div(\s+)class\="syn\-ant\-list"\>\<ul\>(?<content>.*)\<\/ul\>/Us',
                $group,
                $blocks
            );

            foreach ($blocks["content"] as $k => $list) {
                if (empty($list)) {
                    continue;
                }
                preg_match_all(
                    '/\<li\>\<p\>(?P<title>.*)\<\/p\>(?P<value>.*)\<\/li>/Us',
                    $list,
                    $items
                );
                foreach ($items['title'] as $m => $title) {
                    // Turn each closing </a> into a space so adjacent linked words stay separated.
                    $words = strip_tags(str_replace("</a>", " ", $items["value"][$m]));
                    $result[$groupIndex][$blocks["adj"][$k]][$title] = $words;
                }
            }
        }

        return $result;
    }

    /**
     * Extracts phrases from the `id="en-phrase"` section.
     *
     * The section is split on "</dd>" and at most self::$num fragments are
     * read. A fragment with up to three "</p>"-delimited parts yields
     * phrase => definition. A longer fragment yields the definition wrapped,
     * once per extra pair of parts, as array(previous value, array(part, part)).
     *
     * @return array Map of phrase => definition or nested array; empty when the section is missing.
     */
    private function getPhrase()
    {
        preg_match_all(
            '/id="en\-phrase"\>(.*)\<div class\="source"\>/Us',
            $this->getContent(),
            $phrase
        );
        if (!isset($phrase[1][0])) {
            return array();
        }

        $entries = array_slice(explode("</dd>", $phrase[1][0]), 0, self::$num);

        $result = array();
        foreach ($entries as $entry) {
            $parts = explode("</p>", $entry);
            $count = count($parts);

            // A fragment without any "</p>" (typically the tail after the last
            // "</dd>") holds no phrase/definition pair.
            if ($count < 2) {
                continue;
            }

            // NOTE(review): both branches strip a plain space from the phrase key;
            // this may originally have been "&nbsp;" - confirm against the live markup.
            if ($count <= 3) {
                $result[str_replace(" ", "", strip_tags($parts[0]))] = strip_tags($parts[1]);
                continue;
            }

            $head   = array_slice($parts, 0, 2);          // phrase + definition
            $body   = array_slice($parts, 0, $count - 1); // everything except the trailing remainder
            $extras = array_chunk(array_diff($body, $head), 2);

            $name = trim(str_replace(" ", "", strip_tags($head[0])));
            $result[$name] = strip_tags($head[1]);
            foreach ($extras as $pair) {
                $result[$name] = array($result[$name], array_map('strip_tags', $pair));
            }
        }

        return $result;
    }

    /**
     * Converts an array to its var_export() string form, escaped with addslashes().
     *
     * @param array $data       Data to convert.
     * @param bool  $isformdata When truthy (default 1) the data is first passed through new_stripslashes().
     * @return string Exported string, or '' when $data compares equal to ''.
     */
    private function array2string($data, $isformdata = 1)
    {
        if ($data == '') {
            return '';
        }
        if ($isformdata) {
            $data = $this->new_stripslashes($data);
        }

        return addslashes(var_export($data, true));
    }

    /**
     * Applies stripslashes() to a string, or recursively to every element of an array.
     *
     * @param string|array $string Value to process.
     * @return mixed Processed value of the same shape.
     */
    private function new_stripslashes($string)
    {
        if (!is_array($string)) {
            return stripslashes($string);
        }
        foreach ($string as $key => $val) {
            $string[$key] = $this->new_stripslashes($val);
        }

        return $string;
    }
}

// Usage:
// $dict = new Dict();
// $result = $dict->content("express");
以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。
本文地址:https://www.stayed.cn/item/10461
转载请注明出处。
本站部分内容来源于网络,如侵犯到您的权益,请联系我