获取网页 HTML 元素内容的方法：① 通过正则表达式匹配获取
// Downloads a page and collects every <label for="caller">…</label> element found in its HTML.

// Address of the page to download. This is a placeholder: it must be set to an
// absolute http/https URL before the request below is created.
string requestUrl = string.Empty;

// Matches the whole <label for="caller">…</label> element (lazy, case-insensitive).
Regex rxGetInfo = new Regex("<label for=\"caller\">.*?</label>", RegexOptions.IgnoreCase);
// Matches any single HTML tag. It is not applied in this snippet; presumably it is
// meant for stripping the markup out of each match afterwards.
Regex rxFilter = new Regex("<.*?>");

// Direct casts fail immediately with InvalidCastException for a non-HTTP URI,
// instead of yielding null and a later NullReferenceException as an "as" cast would.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUrl);

string returnContent;
// The using blocks release the response and the reader even if reading throws.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader sr = new StreamReader(response.GetResponseStream()))
{
    returnContent = sr.ReadToEnd();
}

// All occurrences of the target element in the downloaded HTML.
MatchCollection mc = rxGetInfo.Matches(returnContent);
② 根据元素的 id 属性，通过 GetElementById 获取：HtmlDocument temphtml = new HtmlDocument(); temphtml.GetElementById("元素id"); ③ 过滤 HTML 标签
// Patterns are constructed once and reused, rather than being looked up on every call.
private static readonly Regex ParagraphBoundaryRegex =
    new Regex("</p\\s*>\\s*<p\\s*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

// The slash is optional so that <br>, <br/> and <br /> are all treated as line breaks.
private static readonly Regex LineBreakRegex =
    new Regex("<br\\s*/?>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

private static readonly Regex AnyTagRegex =
    new Regex("<[^>]+>", RegexOptions.Compiled);

/// <summary>
/// Strips HTML tags from a string, keeping a rough text layout: a paragraph
/// boundary (<c>&lt;/p&gt;&lt;p&gt;</c>) becomes a blank line, a <c>&lt;br&gt;</c>
/// becomes a newline, and every double quote becomes two single quotes.
/// </summary>
/// <param name="stringToStrip">The HTML content to clean. May be null or empty.</param>
/// <returns>
/// The content with all tags removed; an empty string when
/// <paramref name="stringToStrip"/> is null or empty.
/// </returns>
public static string StripHTML(string stringToStrip)
{
    if (string.IsNullOrEmpty(stringToStrip))
    {
        return string.Empty;
    }

    // Layout-bearing tags are converted first; otherwise the generic tag removal
    // at the end would delete them without leaving a line break behind.
    stringToStrip = ParagraphBoundaryRegex.Replace(stringToStrip, "\n\n");
    stringToStrip = LineBreakRegex.Replace(stringToStrip, "\n");

    // A literal substitution needs no regex.
    stringToStrip = stringToStrip.Replace("\"", "''");

    return StripHtmlXmlTags(stringToStrip);
}

/// <summary>
/// Removes every <c>&lt;...&gt;</c> tag from <paramref name="content"/>.
/// </summary>
/// <param name="content">The text to remove tags from.</param>
/// <returns>The text with all tags deleted.</returns>
private static string StripHtmlXmlTags(string content)
{
    return AnyTagRegex.Replace(content, "");
}
转载于:https://www.cnblogs.com/angleSJW/archive/2011/06/29/2093106.html
