<?php

// Direct-access guard: stop immediately unless the EMLOG_ROOT constant exists
// (presumably defined by the host application's bootstrap — confirm).
defined('EMLOG_ROOT') || exit('error!');
/**
 * Static helpers for scraping pages: locating marker tags in HTML, extracting
 * and cleaning fields, collecting article links from paginated list pages,
 * charset conversion and multi-byte safe truncation.
 *
 * Marker templates use "(*)" as a wildcard; everything else in a template is
 * matched literally. Page fetching goes through Http::doGet(), which is not
 * defined in this file.
 */
class spider{

    /**
     * Find the first occurrence of a marker template in a string.
     *
     * The template is quoted literally and each "(*)" placeholder becomes an
     * ungreedy "(.*)" capture group. Matching is case-insensitive and "."
     * also matches newlines.
     *
     * @param string $str    Text to search.
     * @param string $tag    Marker template, may contain "(*)" wildcards.
     * @param mixed  $is_gbk Pass the string 'y' to convert $str from GBK to UTF-8 first.
     * @return array preg_match() result: [0] is the full match, [1..] the wildcard
     *               captures; empty array when nothing matched.
     */
    static function get_tag($str,$tag,$is_gbk=false)
    {
        if($is_gbk==='y')
        {
            $str = self::auto_charset($str);
        }
        // Quote everything, then re-open the quoted "(*)" placeholders as capture groups.
        $pattern = str_replace('\(\*\)',"(.*)",preg_quote((string)$tag,'/'));
        $matches = array();
        preg_match("/".$pattern."/Uis",(string)$str,$matches);
        return $matches;
    }

    /**
     * Extract the text between a start marker and an end marker, then clean it.
     *
     * Steps: optional GBK->UTF-8 conversion, capture between the markers,
     * strip_tags() (keeping $r_tag if given), apply the replacement rules in
     * $re, remove CR/LF/TAB characters and "&nbsp;" entities, trim.
     *
     * @param string $str    Source document.
     * @param string $start  Start marker template ("(*)" allowed).
     * @param string $end    End marker template ("(*)" allowed).
     * @param string $r_tag  Tags to keep, in strip_tags() format; '' strips all tags.
     * @param string $re     Replacement rules "pattern|replacement", several rules
     *                       separated by "###". A rule without "|" deletes its match.
     * @param mixed  $is_gbk Pass 'y' when $str is GBK encoded. Defaults to false so the
     *                       signature no longer has a required parameter after optional ones.
     * @return string Cleaned field text; '' when nothing could be captured.
     */
    static function get_field($str,$start,$end,$r_tag='',$re='',$is_gbk=false)
    {
        // Convert once up front instead of once per marker lookup.
        if($is_gbk==='y')
        {
            $str = self::auto_charset($str);
        }
        $content = self::capture_between($str,$start,$end);
        if($r_tag=='')
        {
            $content = strip_tags($content);
        }
        else
        {
            $content = strip_tags($content,$r_tag);
        }
        if($re!=='' && $re!==null)
        {
            $content = self::apply_replacements($content,$re);
        }
        $content = preg_replace("/[\r\n\t]+/","",$content);
        $content = preg_replace("/&nbsp;/i","",$content);
        return trim($content);
    }

    /**
     * Return the text captured between the first match of $start and the first
     * match of $end in $str. A marker that is not found is treated as an empty
     * string, so the capture then extends to the start/end side accordingly.
     *
     * @return string Captured text, or '' when the combined pattern does not match.
     */
    private static function capture_between($str,$start,$end)
    {
        $head = self::get_tag($str,$start);
        $tail = self::get_tag($str,$end);
        $head = isset($head[0]) ? $head[0] : '';
        $tail = isset($tail[0]) ? $tail[0] : '';
        $found = self::get_tag($str,$head.'(*)'.$tail);
        return isset($found[1]) ? $found[1] : '';
    }

    /**
     * Apply "pattern|replacement" rules (separated by "###") to $content.
     *
     * For each rule the first match of the pattern is located with get_tag()
     * and every occurrence of that matched text is replaced. Rules whose
     * pattern does not match (or matches the empty string) are skipped.
     */
    private static function apply_replacements($content,$re)
    {
        // explode() also handles the single-rule case and a leading "###",
        // which a truthiness test on strpos() would misread as "no separator".
        foreach (explode('###',(string)$re) as $rule)
        {
            $parts = explode('|',$rule);
            $hit = self::get_tag($content,$parts[0]);
            if(!isset($hit[0]) || $hit[0]==='')
            {
                continue;
            }
            $replacement = isset($parts[1]) ? $parts[1] : '';
            $content = str_replace($hit[0],$replacement,$content);
        }
        return $content;
    }

    /**
     * Build the assembled content record for rule $i from a fetched page.
     *
     * @param string $str     Page source.
     * @param mixed  $i       Key of the rule inside $options.
     * @param array  $options Rule set; the keys read here are title_*, content_*,
     *                        is_gbk, excerpt, name, url, hide, top, allow_remark, allow_tb.
     * @return array Record with name, url, title, content, excerpt, hide, top,
     *               allow_remark and allow_tb.
     */
    static function get_content($str,$i,$options)
    {
        $rule = $options[$i];
        $title = self::get_field($str,$rule['title_start'],$rule['title_end'],$rule['title_tag'],$rule['title_re'],$rule['is_gbk']);
        $content = self::get_field($str,$rule['content_start'],$rule['content_end'],$rule['content_tag'],$rule['content_re'],$rule['is_gbk']);
        $excerpt = '';
        if($rule['excerpt']!='')
        {
            // The excerpt is the first N characters of the tag-free content.
            $excerpt = self::msubstr(strip_tags($content), 0, intval($rule['excerpt']));
        }
        return array(
            'name' => $rule['name'],
            'url' => $rule['url'],
            'title' => $title,
            'content' => $content,
            'excerpt' => $excerpt,
            'hide' => $rule['hide'],
            'top' => $rule['top'],
            'allow_remark' => $rule['allow_remark'],
            'allow_tb' => $rule['allow_tb'],
        );
    }

    /**
     * Fetch every list page of rule $id and collect the article links on it.
     *
     * List page URLs are generated by substituting "(*)" in the rule's url with
     * the numbers url_start..url_end in steps of url_add. On each page the list
     * area is cut out with list_start/list_end and links are captured between
     * link_start/link_end. Links are then filtered (url_in: must contain every
     * term; url_not_in: must contain none; terms separated by "###") and
     * relative links are prefixed with the rule's host.
     *
     * @param array $options Rule set.
     * @param mixed $id      Key of the rule inside $options.
     * @return array List of array('list_url' => page URL, 'url' => links); the
     *               link arrays keep their original match indexes as keys.
     */
    static function get_list_url($options,$id)
    {
        $rule = $options[$id];
        $is_gbk = isset($rule['is_gbk']) ? $rule['is_gbk'] : false;

        // A step below 1 would never reach url_end, so force at least 1.
        $step = max(1,(int)$rule['url_add']);
        $last = (int)$rule['url_end'];
        $pages = array();
        for($i=(int)$rule['url_start'];$i<=$last;$i+=$step)
        {
            $pages[] = str_replace('(*)',$i,$rule['url']);
        }

        $required = self::split_terms(isset($rule['url_in']) ? $rule['url_in'] : '');
        $forbidden = self::split_terms(isset($rule['url_not_in']) ? $rule['url_not_in'] : '');
        // Drop trailing slashes so joining never produces "host//path".
        $host = rtrim($rule['host'],'/');

        $url_arr = array();
        foreach ($pages as $page)
        {
            $html = Http::doGet($page);
            if($is_gbk==='y')
            {
                $html = self::auto_charset($html);
            }
            $links = self::extract_links(
                self::capture_between($html,$rule['list_start'],$rule['list_end']),
                $rule['link_start'],
                $rule['link_end']
            );

            $kept = array();
            foreach ($links as $k=>$u)
            {
                // Filtered links are skipped for good; they must not be re-added
                // by the host-prefixing step below.
                if(!self::url_passes($u,$required,$forbidden))
                {
                    continue;
                }
                if(preg_match('/^https?:\/\//i',$u))
                {
                    $kept[$k] = $u;
                }
                elseif(substr($u, 0, 1) == '/')
                {
                    $kept[$k] = $host.$u;
                }
                else
                {
                    $kept[$k] = $host.'/'.$u;
                }
            }
            $url_arr[] = array('list_url' => $page ,'url' => $kept);
        }
        return $url_arr;
    }

    /**
     * Capture every link found between the link markers inside a list fragment.
     *
     * The concrete text of the first match of each marker template is used as
     * the literal delimiter for all links. Returns an empty array when the
     * fragment is empty or a marker cannot be found in it.
     */
    private static function extract_links($list,$link_start,$link_end)
    {
        if($list==='')
        {
            return array();
        }
        $head = self::get_tag($list,$link_start);
        $tail = self::get_tag($list,$link_end);
        if(!isset($head[0]) || !isset($tail[0]))
        {
            return array();
        }
        $found = array();
        preg_match_all("/".preg_quote($head[0],'/')."(.*)".preg_quote($tail[0],'/')."/Uis",$list,$found);
        return isset($found[1]) ? $found[1] : array();
    }

    /**
     * Split a "###" separated filter specification into its non-empty terms.
     * An empty() specification yields no terms.
     */
    private static function split_terms($spec)
    {
        $terms = array();
        if(empty($spec))
        {
            return $terms;
        }
        foreach (explode('###',$spec) as $term)
        {
            if($term!=='')
            {
                $terms[] = $term;
            }
        }
        return $terms;
    }

    /**
     * Tell whether a link contains every required term and none of the forbidden ones.
     */
    private static function url_passes($u,$required,$forbidden)
    {
        foreach ($required as $term)
        {
            if(strpos($u,$term)===false)
            {
                return false;
            }
        }
        foreach ($forbidden as $term)
        {
            if(strpos($u,$term)!==false)
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Convert a string, or an array (keys and values, recursively), between charsets.
     *
     * Uses mb_convert_encoding() when available, otherwise iconv(); when neither
     * exists the input is returned unchanged. Empty values, non-string scalars
     * and identical source/target charsets are returned unchanged as well.
     *
     * @param mixed  $fContents String or array to convert.
     * @param string $from      Source charset ("UTF8" is normalised to "utf-8").
     * @param string $to        Target charset ("UTF8" is normalised to "utf-8").
     * @return mixed Converted value.
     */
    static function auto_charset($fContents,$from='gbk',$to='utf-8')
    {
        $from = strtoupper($from)=='UTF8' ? 'utf-8' : $from;
        $to = strtoupper($to)=='UTF8' ? 'utf-8' : $to;
        if( strtoupper($from) === strtoupper($to) || empty($fContents) || (is_scalar($fContents) && !is_string($fContents)) )
        {
            // Same encoding, empty value, or a non-string scalar: nothing to convert.
            return $fContents;
        }
        if(is_string($fContents))
        {
            if(function_exists('mb_convert_encoding'))
            {
                return mb_convert_encoding($fContents, $to, $from);
            }
            if(function_exists('iconv'))
            {
                return iconv($from,$to,$fContents);
            }
            return $fContents;
        }
        if(is_array($fContents))
        {
            // Recurse through self:: (there is no global auto_charset function)
            // and rebuild the array so converted keys replace the old ones.
            $converted = array();
            foreach ($fContents as $key => $val)
            {
                $converted[self::auto_charset($key,$from,$to)] = self::auto_charset($val,$from,$to);
            }
            return $converted;
        }
        return $fContents;
    }

    /**
     * Multi-byte safe substring with an optional ellipsis suffix.
     *
     * Takes $length characters (not bytes) starting at character $start. When
     * that slice already reaches the end of the string nothing was cut off and
     * no suffix is appended; for $start 0 the input is then returned unchanged.
     *
     * @param string   $str     Input text.
     * @param int      $start   Character offset to start at.
     * @param int|null $length  Number of characters to keep; null keeps everything from $start.
     * @param string   $charset Charset of $str.
     * @param bool     $suffix  Append "…" when the text was cut off.
     * @return string
     */
    static function msubstr($str, $start=0, $length=null, $charset="utf-8", $suffix=true)
    {
        $str = (string)$str;
        $slice = self::char_slice($str,$start,$length,$charset);
        $rest = self::char_slice($str,$start,null,$charset);
        if($slice===$rest)
        {
            // Nothing was truncated: return as is, without a suffix.
            return ((int)$start===0) ? $str : $slice;
        }
        if($suffix)
        {
            return $slice."…";
        }
        return $slice;
    }

    /**
     * Character-based substr(): mb_substr(), else iconv_substr(), else a regex
     * split for utf-8/gb2312/gbk/big5 (byte-wise substr() for other charsets).
     * A null $length means "up to the end of the string".
     */
    private static function char_slice($str,$start,$length,$charset)
    {
        if(function_exists("mb_substr"))
        {
            if($length===null)
            {
                $length = mb_strlen($str,$charset);
            }
            return mb_substr($str,$start,$length,$charset);
        }
        if(function_exists('iconv_substr'))
        {
            if($length===null)
            {
                $length = iconv_strlen($str,$charset);
            }
            $slice = iconv_substr($str,$start,$length,$charset);
            // Older PHP versions return false for out-of-range offsets.
            return $slice===false ? '' : $slice;
        }
        $re = array();
        $re['utf-8']  = "/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xff][\x80-\xbf]{3}/";
        $re['gb2312'] = "/[\x01-\x7f]|[\xb0-\xf7][\xa0-\xfe]/";
        $re['gbk']    = "/[\x01-\x7f]|[\x81-\xfe][\x40-\xfe]/";
        $re['big5']   = "/[\x01-\x7f]|[\x81-\xfe]([\x40-\x7e]|[\xa1-\xfe])/";
        $key = strtolower($charset);
        if($key==='utf8')
        {
            $key = 'utf-8';
        }
        if(!isset($re[$key]))
        {
            return $length===null ? (string)substr($str,$start) : (string)substr($str,$start,$length);
        }
        $match = array();
        preg_match_all($re[$key], $str, $match);
        return join("",array_slice($match[0], $start, $length));
    }
}
?>
