叶雨梧桐BLOG

PHP完整的拉取网页数据

发布时间:2015年10月16日 / 分类:PHP笔记 / 吐槽

可恶的360,万恶的360,一下午的时间全造在你身上了。
因开发需要拉取快递信息,就用了快递100的 WAP 查询 API。没想到在本地跑得很嗨皮,移到开发环境就挂了,一下午只成功了三次,我那个心啊,啥也不说了。把返回内容打印出来一看,每次失败都有360那货的存在。无奈了。

看下面的代码,也是醉了。这代码高亮还是要重新做的。 = =||

有码有真相

// Demo call: query the kuaidi100 WAP tracking endpoint for one parcel.
// The current Unix timestamp is sent as the `rand` query parameter
// (presumably to defeat caching — confirm against the remote API).
$rand = time();
$url = "http://wap.kuaidi100.com/wap_result.jsp?rand={$rand}&id=shunfeng&fromWeb=null&postid=905483748587";

// The return value is intentionally not used here, exactly as in the original snippet.
ihttp_request($url);

/**
 * Perform an HTTP(S) request and return the parsed response.
 *
 * Uses the cURL extension when it is available and falls back to a raw
 * fsockopen() conversation otherwise. The response text is handed to
 * ihttp_response_parse().
 *
 * NOTE(review): strexists() and error() are called but not defined in this
 * file; they are assumed to be provided by the surrounding project.
 *
 * @param string       $url     Absolute URL to request.
 * @param string|array $post    Request body. Empty means GET; anything else is sent as POST.
 *                              Arrays are form-encoded unless they carry a file upload
 *                              (a CURLFile instance or a string starting with "@").
 * @param array        $extra   Extra options. Keys containing "CURLOPT_" or numeric keys are
 *                              applied as cURL options; 'ip' overrides the host to connect to
 *                              (cURL branch); every other key is sent as a request header.
 * @param int          $timeout Connect and transfer timeout in seconds.
 *
 * @return mixed The value of error(1, ...) on failure, otherwise the array
 *               produced by ihttp_response_parse().
 */
function ihttp_request($url, $post = '', $extra = array(), $timeout = 60) {
    $urlset = parse_url($url);
    if (!is_array($urlset) || empty($urlset['host'])) {
        // parse_url() returns false for malformed URLs; nothing can be requested without a host.
        return error(1, 'Invalid URL: ' . $url);
    }

    // Fill in every component read below so no undefined index is touched.
    if (empty($urlset['scheme'])) {
        $urlset['scheme'] = 'http';
    }
    if (empty($urlset['path'])) {
        $urlset['path'] = '/';
    }
    if (isset($urlset['query']) && $urlset['query'] !== '') {
        $urlset['query'] = "?{$urlset['query']}";
    } else {
        $urlset['query'] = '';
    }
    $defaultport = $urlset['scheme'] == 'https' ? '443' : '80';
    if (empty($urlset['port'])) {
        $urlset['port'] = $defaultport;
    }
    // Only a non-default port has to be spelled out in the URL / Host header.
    $portsuffix = $urlset['port'] == $defaultport ? '' : ':' . $urlset['port'];

    if (function_exists('curl_init') && function_exists('curl_exec')) {
        $ch = curl_init();
        if (is_array($extra) && !empty($extra['ip'])) {
            // Connect to the given IP while still presenting the original host name.
            $extra['Host'] = $urlset['host'];
            $urlset['host'] = $extra['ip'];
            unset($extra['ip']);
        }
        // The port is included so that URLs such as http://host:8080/ are not silently sent to port 80.
        curl_setopt($ch, CURLOPT_URL, $urlset['scheme'] . '://' . $urlset['host'] . $portsuffix . $urlset['path'] . $urlset['query']);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Errors from this option are suppressed, as in the original code
        // (presumably because some configurations reject it — confirm).
        @curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        // Headers are kept in the output because ihttp_response_parse() reads them.
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);
        if ($post) {
            if (is_array($post)) {
                $filepost = false;
                foreach ($post as $name => $value) {
                    // Check the object case first: substr() must never see a CURLFile or an array.
                    $iscurlfile = class_exists('CURLFile') && $value instanceof CURLFile;
                    $islegacyupload = is_string($value) && substr($value, 0, 1) == '@';
                    if ($iscurlfile || $islegacyupload) {
                        $filepost = true;
                        break;
                    }
                }
                if (!$filepost) {
                    $post = http_build_query($post);
                }
            }
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
        }
        if (!empty($GLOBALS['_W']['config']['setting']['proxy'])) {
            $urls = parse_url($GLOBALS['_W']['config']['setting']['proxy']['host']);
            curl_setopt($ch, CURLOPT_PROXY, "{$urls['host']}:{$urls['port']}");
            $proxyscheme = !empty($urls['scheme']) ? strtoupper($urls['scheme']) : '';
            $proxytype = 'CURLPROXY_' . $proxyscheme;
            if ($proxyscheme !== '' && defined($proxytype)) {
                curl_setopt($ch, CURLOPT_PROXYTYPE, constant($proxytype));
            } else {
                curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
                curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, 1);
            }
            if (!empty($GLOBALS['_W']['config']['setting']['proxy']['auth'])) {
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $GLOBALS['_W']['config']['setting']['proxy']['auth']);
            }
        }
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        // NOTE(review): certificate and host verification are disabled here, so HTTPS
        // peers are not authenticated. Left unchanged because callers may rely on it.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($ch, CURLOPT_SSLVERSION, 1);
        if (defined('CURL_SSLVERSION_TLSv1')) {
            curl_setopt($ch, CURLOPT_SSLVERSION, CURL_SSLVERSION_TLSv1);
        }
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1');
        if (!empty($extra) && is_array($extra)) {
            $headers = array();
            foreach ($extra as $opt => $value) {
                if (strexists($opt, 'CURLOPT_')) {
                    // Option given by constant name, e.g. 'CURLOPT_REFERER'.
                    curl_setopt($ch, constant($opt), $value);
                } elseif (is_numeric($opt)) {
                    // Option given by its numeric constant value.
                    curl_setopt($ch, $opt, $value);
                } else {
                    $headers[] = "{$opt}: {$value}";
                }
            }
            if (!empty($headers)) {
                curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
            }
        }
        $data = curl_exec($ch);
        $errno = curl_errno($ch);
        $error = curl_error($ch);
        curl_close($ch);
        if ($errno || empty($data)) {
            return error(1, $error);
        }
        return ihttp_response_parse($data);
    }

    // Fallback without cURL: build the request by hand and send it over a socket.
    $method = empty($post) ? 'GET' : 'POST';
    $fdata = "{$method} {$urlset['path']}{$urlset['query']} HTTP/1.1\r\n";
    $fdata .= "Host: {$urlset['host']}{$portsuffix}\r\n";
    if (function_exists('gzdecode')) {
        // Only gzip is advertised because ihttp_response_parse() only decodes gzip.
        $fdata .= "Accept-Encoding: gzip\r\n";
    }
    $fdata .= "Connection: close\r\n";
    $hascontenttype = false;
    if (!empty($extra) && is_array($extra)) {
        foreach ($extra as $opt => $value) {
            // cURL options (by name or by number) are not headers; skip them here.
            if (is_numeric($opt) || strexists($opt, 'CURLOPT_')) {
                continue;
            }
            if (strtolower($opt) == 'content-type') {
                $hascontenttype = true;
            }
            $fdata .= "{$opt}: {$value}\r\n";
        }
    }
    if ($post) {
        // A string body is sent as given, matching what the cURL branch does;
        // url-encoding the whole string would mangle "a=1&b=2" or a JSON payload.
        $body = is_array($post) ? http_build_query($post) : (string) $post;
        if (!$hascontenttype) {
            // cURL adds this header for POST bodies automatically; mirror it here.
            $fdata .= "Content-Type: application/x-www-form-urlencoded\r\n";
        }
        $fdata .= 'Content-Length: ' . strlen($body) . "\r\n\r\n{$body}";
    } else {
        $fdata .= "\r\n";
    }

    $transport = $urlset['scheme'] == 'https' ? 'ssl://' : '';
    $fp = fsockopen($transport . $urlset['host'], (int) $urlset['port'], $errno, $error, $timeout);
    if (!$fp) {
        // Bail out before any stream_* call touches the failed handle.
        return error(1, $error);
    }
    stream_set_blocking($fp, true);
    stream_set_timeout($fp, $timeout);
    fwrite($fp, $fdata);
    // Reads until EOF or until the stream timeout fires; unlike a feof()/fgets()
    // loop this cannot spin forever once a read has timed out.
    $content = stream_get_contents($fp);
    fclose($fp);
    if ($content === false) {
        $content = '';
    }
    return ihttp_response_parse($content, true);
}

/**
 * Split a raw HTTP response (status line + headers + body) into its parts.
 *
 * When the text holds several header blocks in a row (the body of one block
 * itself starts with an "HTTP/x.y NNN" status line, as happens after followed
 * redirects or a "100 Continue"), the earlier blocks are skipped and the last
 * response is parsed.
 *
 * @param string $data    Raw response text including the header section.
 * @param bool   $chunked When true, a body announced as "Transfer-Encoding: chunked"
 *                        is decoded with ihttp_response_parse_unchunk().
 *
 * @return array Keys: 'code', 'status', 'responseline', 'headers' (name => value,
 *               or name => list of values for repeated headers), 'content' and
 *               'meta' (the raw text of the response that was parsed).
 */
function ihttp_response_parse($data, $chunked = false) {
    $rlt = array();
    $data = (string) $data;

    // Separate head and body at the first blank line. "\r\n\r\n" is 4 bytes long.
    // Looking only at the START of the body (instead of splitting on every "HTTP/")
    // keeps bodies that merely mention "HTTP/" intact.
    while (true) {
        $pos = strpos($data, "\r\n\r\n");
        if ($pos === false) {
            // No separator at all: treat everything as the header section.
            $split1 = array($data, '');
        } else {
            $split1 = array(substr($data, 0, $pos), (string) substr($data, $pos + 4));
        }
        if (!preg_match('/^HTTP\/\d+(?:\.\d+)? \d{3}/', $split1[1])) {
            break;
        }
        // The "body" is really the next response; each pass strictly shortens $data.
        $data = $split1[1];
    }

    $split2 = explode("\r\n", $split1[0], 2);
    $rlt['code'] = '';
    $rlt['status'] = '';
    // The reason phrase is optional and may contain spaces ("Not Found").
    if (preg_match('/^(\S+) (\S+)(?: (.*))?$/', $split2[0], $matches)) {
        $rlt['code'] = $matches[2];
        $rlt['status'] = isset($matches[3]) ? $matches[3] : '';
    }
    $rlt['responseline'] = $split2[0];
    $rlt['headers'] = array();

    $header = isset($split2[1]) ? explode("\r\n", $split2[1]) : array();
    $isgzip = false;
    $ischunk = false;
    foreach ($header as $v) {
        $colon = strpos($v, ':');
        if ($colon === false) {
            // Not a "Name: value" line; nothing to record.
            continue;
        }
        $key = substr($v, 0, $colon);
        $value = trim((string) substr($v, $colon + 1));
        if (!array_key_exists($key, $rlt['headers'])) {
            $rlt['headers'][$key] = $value;
        } elseif (is_array($rlt['headers'][$key])) {
            $rlt['headers'][$key][] = $value;
        } else {
            // Second occurrence of a header: promote the stored string to a list.
            $rlt['headers'][$key] = array($rlt['headers'][$key], $value);
        }
        $lowerkey = strtolower($key);
        $lowervalue = strtolower($value);
        if (!$isgzip && $lowerkey == 'content-encoding' && $lowervalue == 'gzip') {
            $isgzip = true;
        }
        if (!$ischunk && $lowerkey == 'transfer-encoding' && $lowervalue == 'chunked') {
            $ischunk = true;
        }
    }

    if ($chunked && $ischunk) {
        $rlt['content'] = ihttp_response_parse_unchunk($split1[1]);
    } else {
        $rlt['content'] = $split1[1];
    }
    // Decode only data that really carries the gzip magic bytes, so a body that
    // was already decompressed upstream is not destroyed by a failed gzdecode().
    if ($isgzip && function_exists('gzdecode') && is_string($rlt['content'])
        && strncmp($rlt['content'], "\x1f\x8b", 2) === 0) {
        $decoded = gzdecode($rlt['content']);
        if ($decoded !== false) {
            $rlt['content'] = $decoded;
        }
    }

    $rlt['meta'] = $data;
    if ($rlt['code'] == '100') {
        // Interim response: the real one follows in the body. Keep the caller's
        // $chunked flag; the body is always shorter than $data, so this terminates.
        return ihttp_response_parse($rlt['content'], $chunked);
    }
    return $rlt;
}

/**
 * Decode a body sent with "Transfer-Encoding: chunked".
 *
 * Each chunk is "<hex size>[;extensions]\r\n<data>\r\n"; a chunk of size 0
 * ends the body, and anything after it (trailer lines) is ignored.
 *
 * @param string|null $str Raw chunked body.
 *
 * @return string|false The concatenated chunk data, or false when the input is
 *                      not a non-empty string or a chunk size line is malformed.
 */
function ihttp_response_parse_unchunk($str = null) {
    if (!is_string($str) || strlen($str) < 1) {
        return false;
    }
    $eol = "\r\n";
    $add = strlen($eol);
    $remaining = $str;
    $decoded = '';
    do {
        // Drops the CRLF that closes the previous chunk's data.
        $remaining = ltrim((string) $remaining);
        $pos = strpos($remaining, $eol);
        if ($pos === false) {
            return false;
        }
        // The size line may carry ";name=value" extensions after the hex number.
        $sizefield = substr($remaining, 0, $pos);
        $semicolon = strpos($sizefield, ';');
        if ($semicolon !== false) {
            $sizefield = substr($sizefield, 0, $semicolon);
        }
        $sizefield = trim((string) $sizefield);
        // hexdec() silently skips invalid characters, so validate explicitly.
        if (!preg_match('/^[0-9a-fA-F]+$/D', $sizefield)) {
            return false;
        }
        $len = hexdec($sizefield);
        if ($len == 0) {
            // Terminating chunk: stop here instead of reading trailers as chunk sizes.
            break;
        }
        $decoded .= substr($remaining, ($pos + $add), $len);
        $remaining = substr($remaining, ($len + $pos + $add));
        $check = trim((string) $remaining);
    } while (!empty($check));
    return $decoded;
}

?>

万恶360

QQ截图20151016181930.png

标签:没有标签呢

版权声明:除特殊注明外,均由叶雨梧桐原创,转载请保留文章出处;
本文链接:http://gt520.com/phpbj/322.html
订阅叶雨梧桐BLOG RSS
分类
最新文章
最近回复
  • 梦青年: 我的网站下线了,你可以把我的友链去掉了,感谢多年的支持和关注~ 梦青年。
  • 便宜VPS: 已添加贵站链接,我站:https://www.vultrclub.com,站:便宜VPS
  • vultr: 朋友 交换链接吗
  • 罗小黑: py小白看不懂啊! Traceback (most recent call last): ...
  • 罗小黑: 安装出错,下面无法继续 >>> pip install MySQL-python Synt...
  • 一抹阳光: 学习了(ಡωಡ)
  • wu先生: 我来了。
  • 欣欣: 嗷呜 原来树莓派不是吃的
  • 繁华落尽: 我也是碰到了这种问题,求具体解决方案,谢谢
  • aaa365: 128M小内存 npm install -g hexo-cli 的时候直接就被系统kill掉进...
归档