今日发现的：一个类似 Google、Baidu 的搜索引擎 [C#]，代码比较简单

it2025-02-03 66

// The implementation is very simple; it is only a demo, of course, and nowhere near as
// powerful as Google or Baidu. The package contains crawler code and web-site code.
// The main crawler code is walked through below.

/// <summary>
/// Downloads the page at <c>_URL</c> into <c>_HTMLData</c>, runs the title, meta-keyword and
/// link analysers, raises <c>OnFinishAnalyze</c> and finally releases the parsed data.
/// </summary>
/// <remarks>
/// Does nothing when <c>_HTMLData</c> is already populated. When the download fails, a message
/// is written to the console and neither the analysers nor the event run.
/// </remarks>
private void GetHTML()
{
    if (_HTMLData != null)
        return;

    try
    {
        // 'using' releases the client even when DownloadData throws; the previous code
        // only disposed it on the success path.
        using (WebClient wc = new WebClient())
        {
            Byte[] binData = wc.DownloadData(_URL);

            // NOTE(review): Encoding.Default ignores any charset declared by the page, so
            // pages served in a different encoding may decode incorrectly.
            _HTMLData = Encoding.Default.GetString(binData);
        }
    }
    catch (Exception)
    {
        // Best effort: an unreadable page is reported and skipped, never rethrown.
        Console.WriteLine("Can not read this page!");
        return;
    }

    // Order matters: GetLink stores the title and meta keywords in its first entry.
    GetTitle();
    GetMeta();
    GetLink();

    if (OnFinishAnalyze != null)
    {
        OnFinishAnalyze(this, new EventArgs());
    }
    Console.WriteLine("Finish!");

    // The results are cleared right after the handlers return, so handlers must copy
    // whatever they need while the event is being raised.
    this.Dispose();
}

/// <summary>
/// Extracts the text of the first <c>&lt;title&gt;</c> element of the downloaded page into
/// <c>_Title</c>. <c>_Title</c> becomes an empty string when no title is found.
/// </summary>
private void GetTitle()
{
    if (_Title != null)
        return;

    _Title = "";
    if (_HTMLData == null)
        return;

    // Lazy quantifier: stop at the first closing tag instead of the last one in the
    // document, and allow an empty title.
    Regex reg = new Regex(@"<title>([\S\s]*?)</title>", RegexOptions.IgnoreCase);
    Match m = reg.Match(_HTMLData);
    if (m.Success)
        _Title = m.Groups[1].Value;
}

/// <summary>
/// Parses the hyperlinks in the downloaded HTML and stores them in <c>_ChildURLSet</c>.
/// </summary>
/// <remarks>
/// The first entry describes the page itself (its URL, meta keywords and title); every
/// following entry is one matched anchor, with its href resolved through
/// <see cref="URLJoin"/> and the spaces removed from its link text.
/// <c>_ChildURLSet</c> is left untouched when no HTML has been downloaded.
/// </remarks>
private void GetLink()
{
    if (_ChildURLSet != null)
        return;

    // Regex.Matches rejects a null input; bail out the same way the old try/catch did.
    if (_HTMLData == null)
        return;

    // Hyperlink pattern: group 1 is the href value, group 2 is the link text.
    Regex reg = new Regex("<a[\\s]+href=\"?([\\S]+)\"?[^<>]+>([^<>]+?)</a>", RegexOptions.IgnoreCase);
    MatchCollection mm = reg.Matches(_HTMLData);

    ArrayList urlset = new ArrayList();
    urlset.Add(new QLinkURL(this._URL, "", this._MetaWords, this._Title));
    foreach (Match m in mm)
    {
        string target = URLJoin(m.Groups[1].Value);
        string text = m.Groups[2].Value.Replace(" ", "");
        urlset.Add(new QLinkURL(target, text, ""));
    }

    // typeof is resolved at compile time. Type.GetType("QSplider.QLinkURL") returns null
    // when the type cannot be resolved by that unqualified name, which made ToArray throw.
    _ChildURLSet = (QLinkURL[])urlset.ToArray(typeof(QLinkURL));
}

/// <summary>
/// Extracts the <c>content</c> value of the page's <c>&lt;meta name="keywords"&gt;</c> tag
/// into <c>_MetaWords</c>. <c>_MetaWords</c> becomes an empty string when no such tag is found.
/// </summary>
private void GetMeta()
{
    if (_MetaWords != null)
        return;

    _MetaWords = "";
    if (_HTMLData == null)
        return;

    // The match is confined to a single tag ([^<>]) and needs no closing </meta>, which
    // HTML meta elements do not have. As before, only the name-before-content attribute
    // order is recognised. Group 1 is the content value.
    Regex reg = new Regex(
        "<meta\\s[^<>]*?name\\s*=\\s*[\"']?keywords[\"']?[^<>]*?content\\s*=\\s*[\"']?([^\"'<>]*)",
        RegexOptions.IgnoreCase);
    Match m = reg.Match(_HTMLData);
    if (m.Success)
        _MetaWords = m.Groups[1].Value;
}

/// <summary>
/// Resets the downloaded HTML, the child link set, the meta keywords and the title to
/// <c>null</c>.
/// </summary>
public void Dispose()
{
    this._HTMLData = null;
    this._ChildURLSet = null;
    this._MetaWords = null;
    this._Title = null;
}

/// <summary>
/// Resolves a link found on the page against the page URL (<c>_URL</c>).
/// </summary>
/// <param name="s2">The raw href value; characters in <c>_SplitChar</c> are trimmed from both ends.</param>
/// <returns>
/// <paramref name="s2"/> itself when it is already an absolute http/https URL, otherwise
/// the link resolved against <c>_URL</c>.
/// </returns>
/// <remarks>
/// <c>_URL</c> is no longer modified. The previous code overwrote the field on every call,
/// so each link processed in a loop stripped another path segment from the base.
/// </remarks>
public string URLJoin(string s2)
{
    s2 = s2.Trim(_SplitChar);

    if (s2.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || s2.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        return s2;
    }

    // Preferred path: let System.Uri resolve relative, root-relative ("/x") and
    // parent ("../x") references.
    Uri baseUri;
    Uri resolved;
    if (Uri.TryCreate(_URL, UriKind.Absolute, out baseUri)
        && Uri.TryCreate(baseUri, s2, out resolved))
    {
        return resolved.AbsoluteUri.Trim(_SplitChar);
    }

    // Fallback for a base that is not a well-formed absolute URI: plain string join on
    // the base's directory, using a local copy so the field stays intact.
    string baseUrl = _URL;
    if (s2.StartsWith("/"))
        s2 = s2.Substring(1);

    // An index above 9 means the last slash is past the "//" of the scheme.
    int lastSlash = baseUrl.LastIndexOf("/");
    if (lastSlash > 9)
        baseUrl = baseUrl.Substring(0, lastSlash);

    return (baseUrl + "/" + s2).Trim(_SplitChar);
}

// The functions above do most of the work. This is the crawler code; the package also
// contains the database part and the web front end. Download links:
//   http://www.libing.net.cn/attachment.php?f=attachment%2F%2Fqsearch.splider.zip
//   http://www.libing.net.cn/attachment.php?f=attachment%2F%2Fqsearch.website.zip
//   http://59.70.157.222/QSearch.WebSite.zip

转载于:https://www.cnblogs.com/wbbady/archive/2007/07/10/812006.html

最新回复(0)