Tag 系统已经完成,再修复添加词条、修改内容方面的一些问题,就可以开始做数据采集了。
网上关于数据采集的资料很多,我参考这些资料并结合自己的需要,写了下面的代码:
/// <summary>
/// Click handler that walks four listing pages, imports every entry linked from them
/// through <c>Lemma.AddLemma</c>, and streams a per-entry report to the response.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    Lemma lemma = new Lemma();
    Response.Write("采集结果:<br/><br/>");
    Response.Flush();

    // The listing URL is strurl followed by an offset: 0, 10, 20, 30.
    for (int i = 0; i <= 3; i++)
    {
        string sUrl = strurl + (i * 10).ToString();
        Response.Write("采集url:" + sUrl + "<br/>");
        Response.Flush();

        // Every "/view/<digits>.htm" link found on the listing page is one entry to import.
        foreach (string temp in GetHtmls(@"/view/\d+\.htm", GetUrlHtml(sUrl)))
        {
            string url = u + temp;
            string sHtml = GetUrlHtml(url);
            string sLemma = GetLemma(sHtml);
            string sDetail = GetDetail(sHtml);
            string sTag = GetTag(sHtml);
            int idLemma = lemma.AddLemma(sLemma, sDetail, "cloud", 0, string.Empty, url, sTag);

            // The values below come from a remote page, so they are HTML-encoded
            // before being echoed into this page's markup.
            string safeUrl = Server.HtmlEncode(url);

            StringBuilder sb = new StringBuilder();
            sb.Append("id:").Append(idLemma)
              .Append("<br/> 词条:").Append(Server.HtmlEncode(sLemma)).Append("<br/>");
            // Fixed: the href attribute used to be closed with two quotes ('').
            sb.Append("Tag:").Append(Server.HtmlEncode(sTag))
              .Append("<br/> 连接:<a href='").Append(safeUrl)
              .Append("' target='_blank'>").Append(safeUrl).Append("</a><br/>");

            // A positive value from AddLemma is treated as the new id;
            // anything else is reported as an error code.
            if (idLemma > 0)
            {
                sb.Append("成功!")
                  .Append(" <a href='../index/show.aspx?id=").Append(idLemma)
                  .Append("' target='_blank'>查看</a>");
            }
            else
            {
                sb.Append("失败!错误代码:").Append(idLemma);
            }

            sb.Append("<br/><br/>");
            Response.Write(sb.ToString());
            Response.Flush();
        }
    }
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it with
/// <see cref="Encoding.Default"/>.
/// </summary>
/// <param name="url">Address to download; it is also sent as the Referer header.</param>
/// <returns>The decoded body, or an empty string when the download fails for any reason.</returns>
public static string GetUrlHtml(string url)
{
    string output = "";
    // NOTE(review): Encoding.Default depends on the machine configuration —
    // confirm it matches the charset of the pages being fetched.
    Encoding encode = Encoding.Default;

    // WebClient is IDisposable; the using block releases it even when the download throws.
    using (WebClient webclient = new WebClient())
    {
        try
        {
            webclient.Headers.Add("Referer", url);
            byte[] buff = webclient.DownloadData(url);
            output = encode.GetString(buff);
        }
        catch
        {
            // Deliberate best-effort: any failure yields the empty string
            // so that the caller simply finds nothing to process.
        }
    }

    return output;
}
/// <summary>
/// Returns the text found between a <paramref name="begin"/> pattern and an
/// <paramref name="end"/> pattern by delegating to the (pattern, content) overload.
/// </summary>
/// <param name="begin">Regular-expression fragment placed before the captured text.</param>
/// <param name="end">Regular-expression fragment placed after the captured text.</param>
/// <param name="content">Text to search.</param>
/// <returns>Whatever the two-argument overload returns for the combined pattern.</returns>
public static string GetHtml(string begin, string end, string content)
{
    // The middle fragment is a lazy capture group that is also allowed to cross line breaks.
    string combined = string.Concat(begin, "((.*?\\n?)*?)", end);
    return GetHtml(combined, content);
}
/// <summary>
/// Returns the first capture group of the first match of <paramref name="pattern"/>
/// in <paramref name="content"/>.
/// </summary>
/// <param name="pattern">Regular expression; its group 1 is the value returned.</param>
/// <param name="content">Text to search; null is treated as "no match".</param>
/// <returns>The text of group 1, or an empty string when nothing matches.</returns>
public static string GetHtml(string pattern, string content)
{
    // Built first so that an invalid pattern is still reported for any content.
    Regex reg = new Regex(pattern);

    // Same leniency as ReplaceText: missing input produces an empty result.
    if (content == null)
    {
        return string.Empty;
    }

    Match match = reg.Match(content);
    return match.Success ? match.Groups[1].Value : string.Empty;
}
/// <summary>
/// Collects every span that starts with a <paramref name="begin"/> pattern and ends
/// with an <paramref name="end"/> pattern by delegating to the (pattern, content) overload.
/// </summary>
/// <param name="begin">Regular-expression fragment that opens a span.</param>
/// <param name="end">Regular-expression fragment that closes a span.</param>
/// <param name="content">Text to search.</param>
/// <returns>Whatever the two-argument overload returns for the combined pattern.</returns>
public static StringCollection GetHtmls(string begin, string end, string content)
{
    // The middle fragment is a lazy capture group that is also allowed to cross line breaks.
    string combined = string.Concat(begin, "((.*?\\n?)*?)", end);
    return GetHtmls(combined, content);
}
/// <summary>
/// Returns the full text of every match of <paramref name="pattern"/> in
/// <paramref name="content"/>, in order of appearance.
/// </summary>
/// <param name="pattern">Regular expression to search for.</param>
/// <param name="content">Text to search; null is treated as "no matches".</param>
/// <returns>A collection of the matched strings; empty when nothing matches.</returns>
public static StringCollection GetHtmls(string pattern, string content)
{
    // Built first so that an invalid pattern is still reported for any content.
    Regex reg = new Regex(pattern);
    StringCollection list = new StringCollection();

    // Same leniency as ReplaceText: missing input produces an empty result.
    if (content == null)
    {
        return list;
    }

    // A MatchCollection only contains successful matches, so no per-item check is needed.
    foreach (Match match in reg.Matches(content))
    {
        list.Add(match.Value);
    }

    return list;
}
/// <summary>
/// Regex-based replace: substitutes every match of <paramref name="pattern"/> in
/// <paramref name="input"/> with <paramref name="replacement"/>, ignoring case and
/// with ^ and $ matching at line boundaries.
/// </summary>
/// <param name="input">Text to process.</param>
/// <param name="pattern">Regular expression to look for.</param>
/// <param name="replacement">Replacement pattern applied to each match.</param>
/// <returns>The rewritten text, or an empty string when <paramref name="input"/> is null or empty.</returns>
public static string ReplaceText(string input, string pattern, string replacement)
{
    if (input == null || input.Length == 0)
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;
    return Regex.Replace(input, pattern, replacement, options);
}
/// <summary>
/// Removes every <paramref name="tag"/> element from <paramref name="input"/>,
/// including the content between its opening and closing tags.
/// </summary>
/// <param name="input">Markup to clean.</param>
/// <param name="tag">Element name; it is inserted into the pattern unescaped.</param>
/// <returns>The result of <c>ReplaceText</c> for the generated pattern.</returns>
public static string ClearWholeTag(string input, string tag)
{
    // \b stops a short name such as "b" from also matching <br> or <body>.
    // [\s\S]*? is used instead of .*? so that elements spanning several lines are removed too.
    string pattern = @"<" + tag + @"\b[^>]*?>[\s\S]*?</" + tag + @"\s*>";
    return ReplaceText(input, pattern, "");
}
/// <summary>
/// Removes the opening and closing <paramref name="tag"/> tags from
/// <paramref name="input"/> while keeping the content between them.
/// </summary>
/// <param name="input">Markup to clean.</param>
/// <param name="tag">Element name; it is inserted into the pattern unescaped.</param>
/// <returns>The result of <c>ReplaceText</c> for the generated pattern.</returns>
public static string ClearTag(string input, string tag)
{
    // \b stops a short name such as "a" from also matching <abbr> or <area>.
    string pattern = @"<\/?" + tag + @"\b[^>]*>";
    return ReplaceText(input, pattern, "");
}
/// <summary>
/// Strips every opening and closing tag from <paramref name="input"/>,
/// leaving the text between tags in place.
/// </summary>
/// <param name="input">Markup to clean.</param>
/// <returns>The result of <c>ReplaceText</c> for the any-tag pattern.</returns>
public static string ClearAllTag(string input)
{
    // An optional slash, a run of ASCII letters, then everything up to the next '>'.
    const string anyTagPattern = @"<\/?[a-zA-Z]+[^>]*>";
    return ReplaceText(input, anyTagPattern, string.Empty);
}
数据采集就是爽,先来三百多条吧,哈哈。
转载于:https://www.cnblogs.com/cloudgamer/archive/2008/02/19/1072923.html