PHP制作百度词典查词采集器

前端技术 2023/09/04 PHP

百度dict 采集样本

这是我写的一个采集器，用来采集百度词典（dict）翻译页面上的所有结果数据，项目里还附带了 13.5 万条单词库和一个简单的采集案例。这里先把主要的类 dict.class.php 放出来，项目地址：http://github.com/widuu/baidu_dict ，有需要的直接 fork 就可以了~么么哒。这东西用的人很少，所以觉得有用的兄弟尽管拿走哈~

<?php
/**
 * dict.class.php 采集百度词典翻译内容
 *
 * @copyright      (C) 2014 widuu
 * @license       http://www.widuu.com
 * @lastmodify     2014-2-15
 */
 
 
// Declare the response as UTF-8 HTML; "utf-8" is the registered charset name.
header("Content-Type: text/html; charset=utf-8");
/**
 * Scrapes the Baidu dictionary result page for an English word and extracts
 * phonetic symbols, pronunciation audio URLs, example sentences, concise
 * definitions, synonyms/antonyms and phrases from its HTML.
 *
 * The page is downloaded once per content() call and shared by all extractors.
 */
class Dict{

    /** @var string English word currently being looked up. */
    private $word;

    /**
     * HTML of the result page fetched for $word; null until it has been fetched.
     * @var string|null
     */
    private $html = null;

    // Maximum number of phrase entries returned by getPhrase().
    private static $num = 10;

    public function __construct(){}


    /**
     * Public entry point: look up a word and return everything scraped for it.
     *
     * @param string $word English word
     * @return array array(
     *              "symbol"  => phonetic symbols,
     *              "pro"     => pronunciation audio URLs,
     *              "example" => example sentences,
     *              "explain" => concise definitions,
     *              "synonym" => synonyms / antonyms,
     *              "phrase"  => phrases
     *          )
     */
    public function content($word){
        $this->word = $word;
        // Drop the page cached for a previous word so this lookup fetches afresh.
        $this->html = null;

        return array(
            "symbol"  => $this->Pronounced(),   // phonetic symbols
            "pro"     => $this->getSay(),       // pronunciation audio
            "example" => $this->getExample(),   // example sentences
            "explain" => $this->getExplain(),   // concise definitions
            "synonym" => $this->getSynonym(),   // synonyms / antonyms
            "phrase"  => $this->getPhrase()     // phrases
        );
    }


    /**
     * Download the Baidu dictionary page for the current word with cURL.
     *
     * The result is cached, so the six extractors triggered by one content()
     * call share a single HTTP request. A transfer error is echoed (as before)
     * and an empty string is returned so the extractors simply find nothing.
     *
     * @return string page HTML, or '' when the transfer failed
     */
    private function getContent(){
        if ($this->html !== null) {
            return $this->html;
        }

        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        // Encode the word so spaces and special characters cannot break the query string.
        $url = "http://dict.baidu.com/s?wd=".urlencode((string) $this->word);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $result = curl_exec($ch);
        // The error must be read from the handle that performed the transfer.
        if (curl_errno($ch)) {
            echo 'Errno'.curl_error($ch);
        }
        curl_close($ch);

        // curl_exec() yields false on failure; normalise to a string for the regexes.
        $this->html = is_string($result) ? $result : '';
        return $this->html;
    }


    /**
     * Pick the first two captures of group 1 as the British / American pair.
     *
     * @param array $matches result array filled by preg_match_all()
     * @return array array('en' => string|null, 'us' => string|null)
     */
    private function enUsPair($matches){
        return array(
            'en' => isset($matches[1][0]) ? $matches[1][0] : null,
            'us' => isset($matches[1][1]) ? $matches[1][1] : null
        );
    }


    /**
     * Extract the phonetic symbols.
     *
     * @return array array('en' => British, 'us' => American); a value is null
     *               when the page does not contain it
     */
    private function Pronounced(){
        preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui", $this->getContent(), $pronounced);
        return $this->enUsPair($pronounced);
    }

    /**
     * Extract the pronunciation audio URLs (the first two url="..." attributes).
     *
     * @return array array('en' => British, 'us' => American); a value is null
     *               when the page does not contain it
     */
    private function getSay(){
        preg_match_all("/url=\"(.*)\"/Ui", $this->getContent(), $pronounced);
        return $this->enUsPair($pronounced);
    }

    /**
     * Extract example sentences from the page's "var example_data = ..." script.
     *
     * The nested array literal is split on "[[[" and then "[[" ; within each
     * segment the first quoted string of every ["...", token is collected and
     * joined with spaces into one sentence. Sentences are then grouped in twos.
     *
     * @return array list of chunks of up to two sentences each (presumably an
     *               original/translation pair); empty array when no example
     *               data is present
     */
    private function getExample(){
        preg_match_all("/var example_data = (.*)\]\;/Us", $this->getContent(), $example);
        if (!isset($example[1][0])) {
            return array();
        }

        // Normalise the opening so the first entry starts with exactly "[[[".
        $raw = "[[[".ltrim($example[1][0], "[");

        $sentences = array();
        foreach (explode("[[[", $raw) as $entry) {
            foreach (explode("[[", "[[".$entry) as $segment) {
                preg_match_all("/\[\"(.*)\",/Us", "[".$segment, $match);
                if (!empty($match[1])) {
                    // Glue first; the (array, glue) order is rejected by PHP 8.
                    $sentences[] = implode(" ", $match[1]);
                }
            }
        }

        return array_chunk($sentences, 2);
    }

    /**
     * Extract the concise definitions block (id="en-simple-means").
     *
     * @return array array(
     *              "x" => array(<strong> text => <span> text) taken from each <p>,
     *              "b" => array(label before "：" => linked word)
     *          )
     *          (presumably part of speech => meaning, and word form => word)
     */
    private function getExplain(){
        preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\<div(\s+)class\=\"source\"\>/Us", $this->getContent(), $explain);
        $section = isset($explain[1][0]) ? $explain[1][0] : '';

        preg_match_all("/\<p\>\<strong\>(?P<adj>.*)\<\/strong\>\<span\>(?P<name>.*)\<\/span\>\<\/p\>/Us", $section, $senses);
        preg_match_all("/\<span\>(?P<tag>[^\>]+)\：\<a(\s+)href\=\"(.*)\"\>(?P<word>.*)\<\/a\>\<\/span\>/Us", $section, $forms);

        $meanings = array();
        foreach ($senses["adj"] as $i => $label) {
            $meanings[$label] = $senses["name"][$i];
        }

        $related = array();
        foreach ($forms["tag"] as $i => $label) {
            $related[$label] = strip_tags($forms["word"][$i]);
        }

        return array("x" => $meanings, "b" => $related);
    }


    /**
     * Extract the synonym / antonym block (id="en-syn-ant").
     *
     * @return array nested as [index of <dl> group][<strong> heading][<p> title]
     *               => word list text; empty array when the block is missing
     */
    private function getSynonym(){
        preg_match_all("/id=\"en\-syn\-ant\"\>(.*)<div(\s+)class\=\"source\">/Us", $this->getContent(), $synonym);
        if (!isset($synonym[1][0])) {
            return array();
        }

        // One group per <dl>; each group holds parallel lists of headings and <ul> bodies.
        $groups = array();
        foreach (explode("</dl>", $synonym[1][0]) as $key => $block) {
            // NOTE(review): the "\ \;" before </strong> looks like an "&nbsp;" entity
            // that was decoded when this code was published; confirm against the page.
            preg_match_all("/\<strong\>(?P<adj>.*)\ \;\<\/strong\>\<\/div\>\<div(\s+)class\=\"syn\-ant\-list\"\>\<ul\>(?<content>.*)\<\/ul\>/Us", $block, $parsed);
            $groups[$key] = array(
                "adj"     => $parsed["adj"],
                "content" => $parsed["content"]
            );
        }

        $result = array();
        foreach ($groups as $key => $group) {
            foreach ($group["content"] as $k => $list) {
                if (empty($list)) {
                    continue;
                }
                preg_match_all("/\<li\>\<p\>(?P<title>.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $list, $items);
                foreach ($items['title'] as $m => $title) {
                    // "<...>" are the regex delimiters here: every "</a" becomes a
                    // space so adjacent words stay separated after strip_tags().
                    $words = strip_tags(preg_replace("<</a>>", " ", $items["value"][$m]));
                    $result[$key][$group["adj"][$k]][$title] = $words;
                }
            }
        }
        return $result;
    }

    /**
     * Extract phrases (id="en-phrase"), limited to the first self::$num entries.
     *
     * @return array phrase (spaces removed) => definition string, or, when the
     *               entry has further paragraphs, a nested array of
     *               array(previous value, array of up to two extra paragraphs);
     *               empty array when the block is missing
     */
    private function getPhrase(){
        preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us", $this->getContent(), $phrase);
        if (!isset($phrase[1][0])) {
            return array();
        }

        $entries = array_slice(explode("</dd>", $phrase[1][0]), 0, self::$num);
        $result = array();
        foreach ($entries as $entry) {
            $parts = explode("</p>", $entry);
            $n = count($parts);

            // A fragment without any </p> (e.g. the remainder after the last
            // </dd>) carries no phrase/definition pair.
            if ($n < 2) {
                continue;
            }

            if ($n <= 3) {
                $result[str_replace(" ", "", strip_tags($parts[0]))] = strip_tags($parts[1]);
                continue;
            }

            // Drop the remainder after the final </p>, then separate the
            // phrase/definition head from the extra paragraphs. array_diff()
            // compares by value, as the original implementation did.
            $body  = array_slice($parts, 0, $n - 1);
            $head  = array_slice($parts, 0, 2);
            $pairs = array_chunk(array_diff($body, $head), 2);

            $name = trim(str_replace(" ", "", strip_tags($head[0])));
            $result[$name] = strip_tags($head[1]);
            foreach ($pairs as $pair) {
                foreach ($pair as $i => $text) {
                    $pair[$i] = strip_tags($text);
                }
                // Each extra pair wraps the value accumulated so far.
                $result[$name] = array($result[$name], $pair);
            }
        }
        return $result;
    }

    /**
     * Convert an array to its var_export() string form, with slashes added.
     *
     * @param  array $data       array to convert
     * @param  bool  $isformdata when 0, new_stripslashes() is not applied first; defaults to 1
     * @return string the exported string, or '' when $data compares equal to ''
     */
    private function array2string($data, $isformdata = 1) {
        if($data == '') return '';
        if($isformdata) $data = $this->new_stripslashes($data);
        return addslashes(var_export($data, TRUE));
    }

    /**
     * Apply stripslashes() to a string, or recursively to every value of an array.
     *
     * @param mixed $string string or array to process
     * @return mixed the processed string or array
     */
    private function new_stripslashes($string) {
        if(!is_array($string)) return stripslashes($string);
        foreach($string as $key => $val) $string[$key] = $this->new_stripslashes($val);
        return $string;
    }

}

// $dict = new Dict();
// $result = $dict->content("express");

以上就是本文的全部内容了，非常实用的功能，希望小伙伴们能够喜欢。

本文地址：https://www.stayed.cn/item/10461

转载请注明出处。

本站部分内容来源于网络,如侵犯到您的权益,请联系我

微信
QQ好友
QQ空间
腾讯微博
新浪微博
人人网

我的博客

人生若只如初见，何事秋风悲画扇。

我的标签

随笔档案

2024-02(2)
2023-06(1)
2023-05(1)
2023-04(14)
2023-03(3)
2023-01(6)
2022-12(5)
2022-11(5)
2022-07(2)
2022-06(4)
2022-05(3)
2022-03(1)
2021-12(6)
2021-11(1)
2021-10(3)
2021-09(5)
2021-07(5)
2021-02(2)
2021-01(7)
2020-12(18)
2020-11(14)
2020-10(12)
2020-09(10)
2020-08(22)
2020-07(2)
2020-06(1)
2020-04(5)
2020-03(9)
2020-02(7)
2020-01(9)
2019-12(8)
2019-11(10)
2019-10(11)
2019-09(17)
2019-08(16)
2019-07(6)
2019-06(3)
2019-04(1)
2019-03(8)
2019-02(5)
2019-01(1)
2018-11(2)
2018-10(3)
2018-09(1)
2018-08(3)
2018-07(3)
2018-06(7)
2018-04(4)
2018-03(5)
2018-02(4)
2018-01(22)
2017-12(3)
2017-11(5)
2017-10(15)
2017-09(26)
2017-08(1)
2017-07(3)