首页 php教程 php手册 分享下页面关键字抓取components.arrow.com站点代码

分享下页面关键字抓取components.arrow.com站点代码

Jun 13, 2016 am 09:44 AM

复制代码 代码如下:


 /**
 * HOST: components.arrow.com
 */
 //set_time_limit(0);
 // base function
 function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
 {
 $ch = curl_init();
 if (!empty($data)) {
 $data = is_array($data)?http_build_query($data): $data;
 $url .= (strpos($url,'?')? '&': "?") . $data;
 }
 curl_setopt($ch, CURLOPT_URL, $url);
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
 curl_setopt($ch, CURLOPT_POST, 0);
 curl_setopt($ch, CURLOPT_PORT, $port);
 curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); //是否抓取跳转后的页面
 $reffer && curl_setopt($ch, CURLOPT_REFERER, $reffer);
 if($proxy) {
 curl_setopt($ch, CURLOPT_PROXY, $proxy);
 curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
 curl_setopt($ch, CURLOPT_PROXYUSERPWD,"andhm001:andhm123");
 }

$result = array();
 $result['result'] = curl_exec($ch);
 if (0 != curl_errno($ch)) {
 $result['error'] = "Error:\n" . curl_error($ch);

}
 curl_close($ch);
 return $result;
 }

复制代码 代码如下:


/**
 * Perform an HTTP POST request through cURL.
 *
 * @param string       $url     target URL
 * @param array|string $data    request body handed to CURLOPT_POSTFIELDS as-is
 * @param array        $header  extra request header lines; skipped when empty
 * @param int          $timeout connect timeout in seconds
 * @param int          $port    remote port passed to CURLOPT_PORT
 * @return array 'result' holds curl_exec()'s return value; 'error' is present only when cURL reported an error
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 15, $port = 80)
{
    $handle = curl_init();

    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($handle, CURLOPT_PORT, $port);
    if (!empty($header)) {
        curl_setopt($handle, CURLOPT_HTTPHEADER, $header);
    }
    curl_setopt($handle, CURLOPT_POST, 1);
    curl_setopt($handle, CURLOPT_POSTFIELDS, $data);

    // Run the transfer first, then attach the error text only if cURL flagged one.
    $response = array('result' => curl_exec($handle));
    $errorCode = curl_errno($handle);
    if ($errorCode != 0) {
        $response['error'] = "Error:\n" . curl_error($handle);
    }

    curl_close($handle);

    return $response;
}

/**
 * Fetch the HTML source of one search-result list page.
 *
 * @param string $keywords search keywords
 * @param int    $start    offset of the first record to request
 * @return boolean|string false when $start is negative or the request reported an error;
 *                        otherwise the response body returned by curl_post()
 */
function getListHtml($keywords, $start = 0)
{
    // Negative offsets are never valid; reject them before touching the network.
    if ($start < 0) {
        return false;
    }

    $postData = array(
        'search_token' => $keywords,
        'start' => $start,
        'limit' => 100,
    );

    // The keyword becomes a URL path segment, so it has to be percent-encoded
    // (spaces, slashes, '#', ... would otherwise produce a malformed URL).
    $url = 'http://components.arrow.com/part/search/' . rawurlencode($keywords);

    $response = curl_post($url, http_build_query($postData));
    if (isset($response['error'])) {
        return false;
    }

    return $response['result'];
}

/**
 * Collect the detail-page link hrefs from a list page.
 *
 * Returns the first capture group of every match of $pattern in $html.
 *
 * @param string $html HTML source of a list page
 * @return array captured hrefs, or an empty array when preg_match_all() finds nothing
 */
 function getListHref($html)
 {
 // NOTE(review): this pattern looks truncated -- the part before `]+)` (including
 // the opening of the capture group) is missing, presumably stripped when the
 // code was published as HTML. As written it is unlikely to compile as a regex;
 // restore the original pattern from the author's source before relying on it.
 $pattern = '/

]+)">/isU';
 if (preg_match_all($pattern, $html, $matches))
 {
 return $matches[1];
 } else {
 // no matches
 return array();
 }
 }

/**
 * Extract the "start" offset of the next result page from a list page.
 *
 * @param string $html HTML source of a list page
 * @return int offset captured from the buildPagination(...) script call, or -1 when none is found
 */
function getListNextPage($html)
{
    $pattern = '/<script>buildPagination\(\'\d+\',\'\d+\',\'(\d+)\',\d+\);<\/script>/isU';
    if (preg_match($pattern, (string) $html, $matches)) {
        return intval($matches[1]);
    }

    return -1;
}

/**
 * Collect the detail-page links of every result page for a search.
 *
 * @param string $keywords search keywords
 * @return boolean|array false when $keywords is empty; otherwise the list of hrefs
 *                       (empty when the first page yields no links)
 */
function getListHrefAll($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $html = getListHtml($keywords);
    $hrefList = getListHref($html);
    if (empty($hrefList)) {
        // no results
        return array();
    }

    // Remember every offset already requested so that a page pointing back at
    // an earlier offset cannot keep the loop running forever.
    $visited = array(0 => true);
    $nextPage = getListNextPage($html);
    while ($nextPage > 0 && !isset($visited[$nextPage])) {
        $visited[$nextPage] = true;

        $html = getListHtml($keywords, $nextPage);
        $hrefList = array_merge($hrefList, getListHref($html));
        $nextPage = getListNextPage($html);
    }

    return $hrefList;
}

/**
 * Return the trimmed first capture group of $pattern in $subject.
 *
 * @param string $pattern PCRE pattern with at least one capture group
 * @param string $subject text to search
 * @return string|null the trimmed capture, or null when the pattern does not match
 */
function matchFirstGroup($pattern, $subject)
{
    if (preg_match($pattern, $subject, $m)) {
        return trim($m[1]);
    }

    return null;
}

/**
 * Fetch a part detail page and extract its fields.
 *
 * @param string $url detail-page path, appended to http://components.arrow.com
 * @return boolean|array false when $url is empty; an empty array when the request
 *                       reports an error or no "Part No:" entry is found; otherwise
 *                       an associative array of the extracted fields ('para' and
 *                       'price' are arrays when data was found, '' otherwise)
 */
function getDetail($url)
{
    if (empty($url)) {
        return false;
    }

    $host = 'http://components.arrow.com';
    $url = $host . $url;

    $response = curl_get($url);
    if (isset($response['error'])) {
        return array();
    }
    $html = (string) $response['result'];

    $result = array(
        'sup_part' => '',   // supplier part number
        'sup_id' => '',     // supplier ID
        'mfg_part' => '',   // manufacturer part number
        'mfg_name' => '',   // manufacturer name
        'cat_name' => '',   // category name
        'para' => '',       // attributes
        'desc' => '',       // description
        'pdf_url' => '',    // datasheet (PDF) URL
        'sup_stock' => '',  // stock
        'min_purch' => '',  // minimum order quantity
        'price' => '',      // price
        'img_url' => '',    // image URL
        'createtime' => '', // creation time
        'datacode' => '',   // date code
        'package' => '',    // package
        'page_url' => '',   // page URL
    );

    // mfg_part -- a page without a part number is treated as "no data" instead
    // of aborting the whole run.
    $mfgPart = matchFirstGroup('/<li>[\s\n]*<strong>Part No:\s*<\/strong>(.+)<\/li>/isU', $html);
    if ($mfgPart === null) {
        return array();
    }
    $result['mfg_part'] = $mfgPart;

    // mfg_name
    $mfgName = matchFirstGroup('/<li>[\s\n]*<strong>Manufacturer: <\/strong>(.+)<\/li>/isU', $html);
    if ($mfgName !== null) {
        $result['mfg_name'] = $mfgName;
    }

    // cat_name
    $category = matchFirstGroup('/displayCategory\(\'(.[^\']+)\'\);/isU', $html);
    if ($category !== null) {
        $result['cat_name'] = str_replace('|', '>', $category);
    }

    // para -- collected in a real array first; writing string keys straight
    // into the '' placeholder is an error on PHP >= 7.1.
    $specTable = matchFirstGroup('/<table\s+id="part_specs".[^>]*>(.+)<\/table>/isU', $html);
    if ($specTable !== null) {
        $rowPattern = '/<tr>[\s\n]*<td><strong>(.+)<\/strong><\/td><td>(.+)<\/td>[\s\n]*<\/tr>/isU';
        if (preg_match_all($rowPattern, $specTable, $rows)) {
            $para = array();
            foreach ($rows[1] as $k => $label) {
                $label = trim($label);
                $value = trim($rows[2][$k]);
                if ('Package Type' == $label) {
                    $result['package'] = $value;
                    continue;
                }
                $para[$label] = $value;
            }
            if (!empty($para)) {
                $result['para'] = $para;
            }
        }
    }

    // desc
    $desc = matchFirstGroup('/<div\s+id="part_title">.+<h4>(.+)<\/h4>[\s\n]*<\/div>/isU', $html);
    if ($desc !== null) {
        $result['desc'] = $desc;
    }

    // pdf_url
    $pdfPath = matchFirstGroup(
        '/<li\s+class="datasheet">[\s\n]*<strong>Datasheet:<\/strong><a\s+href="(.[^"]+)"/isU',
        $html
    );
    if ($pdfPath !== null) {
        $result['pdf_url'] = $host . $pdfPath;
    }

    // sup_stock -- strip the thousands separators
    $stock = matchFirstGroup('/<td\s+id="inv_1"\s+class="li_inv">([\d,]+)<\/td>/isU', $html);
    if ($stock !== null) {
        $result['sup_stock'] = str_replace(',', '', $stock);
    }

    // min_purch
    $multiple = matchFirstGroup(
        '/<span\s+id="multiples">[\s\n]*<strong>Multiple:\s*<\/strong>(.+)<\/span>/isU',
        $html
    );
    if ($multiple !== null) {
        $result['min_purch'] = $multiple;
    }

    // price -- keyed by minimum quantity
    $prices = array();
    $unitPrice = matchFirstGroup('/<div\s+id="price_1"\s+class="li_price">(.[^<]+)<\/div>/isU', $html);
    if ($unitPrice !== null) {
        $prices[1] = $unitPrice;
    }
    $priceUrl = matchFirstGroup(
        '/<div\s+id="price_1"\s+class="li_price">[\s\n]*<span.[^>]+title="(.[^"]+)">/isU',
        $html
    );
    if ($priceUrl !== null) {
        // The URL sits in an HTML attribute, so its ampersands are entity-encoded.
        $priceUrl = str_replace('&amp;', '&', $priceUrl);
        $priceResponse = curl_get($priceUrl);
        $json = $priceResponse['result'];
        if (!empty($json)) {
            $decoded = json_decode($json, true);
            // Only walk the price tiers when the expected structure is really there.
            if (isset($decoded['parts'][0]['webprice']['resale'])
                && is_array($decoded['parts'][0]['webprice']['resale'])
            ) {
                foreach ($decoded['parts'][0]['webprice']['resale'] as $tier) {
                    if (isset($tier['minqty'], $tier['price'])) {
                        $prices[$tier['minqty']] = $tier['price'];
                    }
                }
            }
        }
    }
    if (!empty($prices)) {
        $result['price'] = $prices;
    }

    // img_url
    // NOTE(review): the published pattern was corrupted by lazy-load markup
    // injected into the page; this is a reconstruction -- confirm against a
    // live detail page.
    $imgUrl = matchFirstGroup('/<div\s+id="part_image">[\s\n]*<img\s+src="(.[^"\s+]+)"/isU', $html);
    if ($imgUrl !== null) {
        $result['img_url'] = $imgUrl;
    }

    // page_url
    $result['page_url'] = $url;

    return $result;
}

/**
 * Entry point: search by keyword and fetch the details of every hit.
 *
 * @param string $keywords search keywords
 * @return array one getDetail() result per link found; empty when there are no links
 */
function getData($keywords)
{
    $hrefList = getListHrefAll($keywords);
    // getListHrefAll() returns false for empty keywords; there is nothing to iterate then.
    if (empty($hrefList)) {
        return array();
    }

    $result = array();
    foreach ($hrefList as $href) {
        $result[] = getDetail($href);
    }

    return $result;
}

// Test script
$keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';
$result = getData($keywords);

print_r($result);
本站声明
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn

热AI工具

Undresser.AI Undress

Undresser.AI Undress

人工智能驱动的应用程序,用于创建逼真的裸体照片

AI Clothes Remover

AI Clothes Remover

用于从照片中去除衣服的在线人工智能工具。

Undress AI Tool

Undress AI Tool

免费脱衣服图片

Clothoff.io

Clothoff.io

AI脱衣机

Video Face Swap

Video Face Swap

使用我们完全免费的人工智能换脸工具轻松在任何视频中换脸!

热门文章

<🎜>:泡泡胶模拟器无穷大 - 如何获取和使用皇家钥匙
3 周前 By 尊渡假赌尊渡假赌尊渡假赌
北端:融合系统,解释
3 周前 By 尊渡假赌尊渡假赌尊渡假赌

热工具

记事本++7.3.1

记事本++7.3.1

好用且免费的代码编辑器

SublimeText3汉化版

SublimeText3汉化版

中文版,非常好用

禅工作室 13.0.1

禅工作室 13.0.1

功能强大的PHP集成开发环境

Dreamweaver CS6

Dreamweaver CS6

视觉化网页开发工具

SublimeText3 Mac版

SublimeText3 Mac版

神级代码编辑软件(SublimeText3)

热门话题

Java教程
1664
14
CakePHP 教程
1423
52
Laravel 教程
1318
25
PHP教程
1269
29
C# 教程
1248
24