最近在采集数据,所以自己写了个小工具。一般的网站可以直接模拟浏览器发送请求,然后用正则表达式分析页面,提取需要的信息。
// Download the page at `url` while imitating a Firefox browser request, then
// pull every run of digits out of the returned HTML with a regular expression.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.Method = "GET";
request.Timeout = 5000; // milliseconds
request.UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:27.0) Gecko/20100101 Firefox/27.0";
request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
// Let the framework send Accept-Encoding and transparently decode the body.
// The previous code advertised "gzip, deflate" by hand but only decoded gzip,
// so a deflate-compressed response would have been read as garbage.
request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
request.KeepAlive = true;
// The Referer header must be a plain URI; percent-encoding the whole URL
// (http%3a%2f%2f...) produces a value servers do not recognize.
request.Referer = url;

string html = string.Empty;
// Dispose the response as well as the streams so the underlying connection
// is released back to the pool.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (System.IO.Stream streamReceive = response.GetResponseStream())
using (System.IO.StreamReader sr = new System.IO.StreamReader(streamReceive, Encoding.GetEncoding("gbk")))
{
    // The page is decoded as GBK regardless of the charset the server reports.
    html = sr.ReadToEnd();
}

// Capture group 1 holds each consecutive run of digits found in the page.
string pattern = "(\\d+)";
Regex reg = new Regex(pattern);
MatchCollection matchs = reg.Matches(html);

foreach (Match match in matchs)
{
    // Each extracted number is treated as a phone number candidate;
    // consume `phone` here (store, print, validate, ...).
    string phone = match.Groups[1].Value;
}
posted on 2014-04-26 22:04 阅读( ...) 评论( ...)