php采集页面内容并自动转码

发布时间:2016-01-12 15:11:40 | 编辑:丝画阁 | 阅读(351)

    /*
     * 用法 get_contents('www.yi210.com', 'utf-8');
     * 采集页面内容并自动转码
     * get_contents()自定义函数
     * $url 需要采集的页面地址
     * $timeout 超时时间,默认20
     */
    function get_contents($url, $timeout = 20)
    {
        if( function_exists('curl_init') ){
            $ch = curl_init();
            curl_setopt( $ch, CURLOPT_URL, $url );
            curl_setopt( $ch, CURLOPT_HEADER, false );
            curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
            curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
            curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );    
            $content = curl_exec( $ch );
            curl_close( $ch );
            $data = $content ? $content : false;
        } else {
            //利用了stream_context_create()设置超时时间:
            $pots = array(
                'http' => array(
                    'timeout' => $timeout
                    )
                );
            $context = stream_context_create( $pots );
            $content = @file_get_contents( $url, false, $context );
            $data = $content ? $content : false;
        }    
        return $data ? my_encoding( $content, 'utf-8' ) : false;
    }
    /*
     * 页面内容并自动转码
     * my_encoding()自定义函数
     * $data 为 curl_exec() 或 file_get_contents() 所获得的页面内容
     * $to 需要转成的编码
     */
    function my_encoding( $data, $to )
    {
        $encode_arr = array('UTF-8','ASCII','GBK','GB2312','BIG5','JIS','eucjp-win','sjis-win','EUC-JP');
        $encoded = mb_detect_encoding($data, $encode_arr);
        $data = mb_convert_encoding($data,$to,$encoded);
        return $data;
    }

关键字