原文链接:http://blog.csdn.net/xyzhaopeng/article/details/6626340
从一个HTML页面的一个表格中提取数据并且将这个数据整理出来加入到MySQL数据库中。
假设目标HTML中我感兴趣的Table有3列,分别是ID,Name,内容。
index.php
<pre class="php" name="code"><?php
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
// Entry script: fetch one HTML page, locate the table(s) of interest by CSS
// class, and hand each one to ContentManager for parsing and persistence.
$urlTarget = "http://www.xxxx.com/targethtmlpage.html";
require_once('ContentManager.php');

// Build a DOM document from the target HTML page.
$htmDoc = new DOMDocument;

// Real-world HTML is rarely well-formed. Collect libxml parse diagnostics
// internally instead of emitting one PHP warning per markup problem, then
// restore whatever setting was active before.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$loaded = $htmDoc->loadHTMLFile($urlTarget);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// loadHTMLFile() returns false when the page cannot be loaded; stop here
// rather than silently iterating over an empty document.
if ($loaded === false) {
    throw new RuntimeException("Unable to load HTML from " . $urlTarget);
}
$htmDoc->normalizeDocument();

// Walk every <table> element in the document.
foreach ($htmDoc->getElementsByTagName('table') as $table) {
    // The class attribute is a whitespace-separated token list, so match the
    // target class as one token; this still matches class="target_table_class".
    $classNames = preg_split('/\s+/', $table->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY);
    if (!in_array('target_table_class', $classNames, true)) {
        continue;
    }

    $contentMgr = new ContentManager();
    $contentMgr->ParseFromDOMElement($table);
    // Parsing is complete at this point; the collected rows can now be used,
    // for example written to MySQL.
    $contentMgr->SerializeToDB();
}
?>
</pre><br>
ContentManager.php
<?php
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
/**
* Description of ContentManager
*
* @author xxxxx
*/
require_once('ContentInfo.php'
);
/**
 * Collects one ContentInfo object per <tr> found under a table element.
 */
class ContentManager {
    /**
     * Parsed rows, in document order.
     *
     * @var ArrayObject
     */
    public $ContentList;

    public function __construct() {
        $this->ContentList = new ArrayObject();
    }

    /**
     * Parses every <tr> under the given table into a ContentInfo and appends
     * it to $ContentList, then echoes the total number of collected entries.
     *
     * @param DOMElement $table The table element whose rows should be parsed.
     */
    public function ParseFromDOMElement(DOMElement $table)
    {
        // Read the rows from the $table argument and count $ContentList;
        // the previous code referenced undefined names for both.
        foreach ($table->getElementsByTagName('tr') as $row) {
            $contentInfo = new ContentInfo();
            $contentInfo->ParseFromDOMElement($row);
            $this->ContentList->append($contentInfo);
        }

        // Diagnostic output: how many entries have been collected so far.
        echo $this->ContentList->count();
    }

    /**
     * Placeholder for persisting $ContentList to the database.
     */
    public function SerializeToDB()
    {
        // Writing to the database is omitted here.
    }
}
?>
ContentInfo.php
<?php
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
/**
* Description of ContentInfo
*
* @author xxxxx
*/
/**
 * Holds the first three cell values (ID, Name, Content) of one table row.
 */
class ContentInfo {
    /** @var string|null Node value of the row's first <td>. */
    public $ID;
    /** @var string|null Node value of the row's second <td>. */
    public $Name;
    /** @var string|null Node value of the row's third <td>. */
    public $Content;

    /**
     * Fills ID, Name and Content from the first three <td> elements found
     * under the given row. Additional cells are ignored; when the row has
     * fewer than three cells the remaining properties are left untouched.
     *
     * @param DOMElement $row The row element to read cells from.
     */
    public function ParseFromDOMElement(DOMElement $row)
    {
        $cells = $row->getElementsByTagName('td');

        // Cell position => property name. The previous code also read
        // $row->length, which DOMElement does not have; that was dropped.
        $fields = array('ID', 'Name', 'Content');
        foreach ($fields as $position => $field) {
            $cell = $cells->item($position);
            if ($cell === null) {
                // No cell at this position, so none at later positions either.
                break;
            }
            $this->$field = $cell->nodeValue;
        }
    }
}
?>
转载于:https://www.cnblogs.com/CheeseZH/p/4858293.html