public class MoviePaperPageProcessor implements PageProcessor { private Site page = Site.me().setRetryTimes(3).setSleepTime(1000);
public Site getSite() { return page; } String url = "jdbc:oracle:thin:@192.168.2.161:1521:orcl" ; String username = "hecv_ay" ; String password = "000000" ;
public void process(Page page) { ChineseCharToEn cte = new ChineseCharToEn(); // System.out.println("获取拼音首字母:"+ cte.getAllFirstLetter("西琉璃村委会")); // String xml = page.getHtml().toString(); List<String> codes=page.getHtml().xpath("//table[@class='villagetable']//tr[@class='villagetr']/td[1]/text()").all(); List<String> names=page.getHtml().xpath("//table[@class='villagetable']//tr[@class='villagetr']/td[3]/text()").all(); System.out.println(codes.get(3)+"---"+names.get(3)); try{ Connection conn = DriverManager.getConnection(url , username , password ) ; conn.setAutoCommit(false); PreparedStatement ps =conn.prepareStatement("INSERT INTO HECV_AY.CM_ADMINISTRATIVE_AREA (ID, CODE, NAME, SHORTNAME, LOOKUP, DEGREE, POS, PARENT, ANCESTOR, DEPTH, TERMINAL, DELETED, CREATED_TIME, LAST_MODIFIED_TIME)" + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"); for(int i=0;i<codes.size();i++){ ps.setString(1, UUID.randomUUID().toString()); ps.setString(2, codes.get(i)); ps.setString(3, names.get(i)); ps.setString(4, names.get(i)); ps.setString(5, cte.getAllFirstLetter(names.get(i))); ps.setString(6, "70"); ps.setString(7, 10*(i+1)+""); ps.setString(8, "f3505dfd-b255-4df6-84e5-09dd03b666db"); ps.setString(9, "(0d0c03f6-934f-40b2-bb4c-b27846f5e987),(12f196e1-d4b8-4c87-8e37-a5bf15d69222),(14ea5270-8e17-4071-9e2f-b34434be1b4b),(40043731-734c-4e65-aba9-5e5928644931),(f3505dfd-b255-4df6-84e5-09dd03b666db)"); ps.setInt(10, 6); ps.setInt(11, 0); ps.setInt(12, 0); ps.setDate(13, new java.sql.Date(new Date().getTime())); ps.setDate(14, new java.sql.Date(new Date().getTime())); ps.addBatch(); } ps.executeBatch(); conn.commit(); conn.close(); }catch(SQLException se){ System.out.println("数据库连接失败!"); se.printStackTrace() ; } }
public static void main(String[] args) { Spider.create(new MoviePaperPageProcessor()) .addUrl("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/41/05/26/410526103.html") .thread(5).run(); }
}
// Derives the pinyin initial letters of Chinese characters.
/**
 * Maps Chinese characters to the lowercase initial letter of their pinyin.
 *
 * <p>The lookup relies on the GB2312 encoding: the two bytes of a character are
 * turned into a four-digit zone/position code, and a table of range starts
 * assigns one letter to each range of codes. Only codes from 1601 up to (but
 * not including) 5590 are covered by the table; every other character
 * (ASCII, symbols, Chinese characters outside that range, characters that
 * GB2312 cannot represent) is returned unchanged.
 */
public class ChineseCharToEn {

    /**
     * Zone/position code at which each letter's range starts. Entry {@code i}
     * is the inclusive start for {@code FIRST_LETTERS[i]}; the final entry is
     * the exclusive end of the last range.
     */
    private static final int[] RANGE_STARTS = { 1601, 1637, 1833, 2078, 2274,
            2302, 2433, 2594, 2787, 3106, 3212, 3472, 3635, 3722, 3730, 3858,
            4027, 4086, 4390, 4558, 4684, 4925, 5249, 5590 };

    /** Letter assigned to each range in {@link #RANGE_STARTS} (no i, u or v). */
    private static final String[] FIRST_LETTERS = { "a", "b", "c", "d", "e",
            "f", "g", "h", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
            "t", "w", "x", "y", "z" };

    /** Offset subtracted from each GB2312 byte to obtain the zone / position number. */
    private static final int GB2312_BYTE_OFFSET = 160;

    /**
     * Returns the initial letters for every character of the given string.
     *
     * <p>The string is walked by Unicode code point, so a surrogate pair is
     * handled as one character. Whitespace characters contribute nothing to
     * the result.
     *
     * @param str the text to convert; may be {@code null}
     * @return the concatenated initials, or {@code ""} for {@code null}/blank input
     */
    public String getAllFirstLetter(String str) {
        if (str == null || str.trim().length() == 0) {
            return "";
        }
        StringBuilder initials = new StringBuilder(str.length());
        int index = 0;
        while (index < str.length()) {
            int width = Character.charCount(str.codePointAt(index));
            initials.append(getFirstLetter(str.substring(index, index + width)));
            index += width;
        }
        return initials.toString();
    }

    /**
     * Returns the initial letter for the first character of the given string.
     *
     * @param chinese the text whose first character is examined; may be {@code null}
     * @return the pinyin initial when the character falls in the lookup table,
     *         the character itself otherwise, or {@code ""} for {@code null}/blank input
     */
    public String getFirstLetter(String chinese) {
        if (chinese == null || chinese.trim().length() == 0) {
            return "";
        }

        // String.getBytes replaces anything GB2312 cannot represent with '?',
        // which would silently corrupt the character; pass it through instead.
        String head = chinese.substring(0, Character.charCount(chinese.codePointAt(0)));
        if (isUnmappableInGb2312(head)) {
            return head;
        }

        // Re-decoding the GB2312 bytes as ISO-8859-1 exposes each byte as one char.
        String byteChars = this.conversionStr(chinese, "GB2312", "ISO8859-1");
        if (byteChars.length() <= 1) {
            // A single byte means a single-byte (non-Chinese) character.
            return byteChars;
        }

        int zone = byteChars.charAt(0) - GB2312_BYTE_OFFSET;
        int position = byteChars.charAt(1) - GB2312_BYTE_OFFSET;
        int zonePositionCode = zone * 100 + position;

        if (zonePositionCode > 1600 && zonePositionCode < 5590) {
            for (int i = 0; i < FIRST_LETTERS.length; i++) {
                if (zonePositionCode >= RANGE_STARTS[i] && zonePositionCode < RANGE_STARTS[i + 1]) {
                    return FIRST_LETTERS[i];
                }
            }
        }

        // Outside the table (symbols, ASCII, other characters): restore the
        // original text and hand back its first character.
        return this.conversionStr(byteChars, "ISO8859-1", "GB2312").substring(0, 1);
    }

    /**
     * Tells whether GB2312 has no encoding for the given text.
     *
     * @param text the text to check
     * @return {@code true} if the text cannot be encoded; {@code false} if it can,
     *         or if the GB2312 charset is not available in this runtime (in which
     *         case the caller proceeds and {@link #conversionStr} reports the problem)
     */
    private static boolean isUnmappableInGb2312(String text) {
        try {
            return !java.nio.charset.Charset.forName("GB2312").newEncoder().canEncode(text);
        } catch (IllegalArgumentException charsetUnavailable) {
            return false;
        }
    }

    /**
     * Re-interprets a string by encoding it in one charset and decoding the
     * resulting bytes in another.
     *
     * @param str           the string to convert
     * @param charsetName   the charset used to encode {@code str} into bytes
     * @param toCharsetName the charset used to decode those bytes
     * @return the converted string, or {@code str} unchanged if either charset
     *         is unsupported (a message is printed to stdout in that case)
     */
    private String conversionStr(String str, String charsetName, String toCharsetName) {
        try {
            str = new String(str.getBytes(charsetName), toCharsetName);
        } catch (UnsupportedEncodingException ex) {
            System.out.println("字符串编码转换异常:" + ex.getMessage());
        }
        return str;
    }

    /** Small demo: prints the initials of a sample place name. */
    public static void main(String[] args) {
        ChineseCharToEn cte = new ChineseCharToEn();
        System.out.println("获取拼音首字母:" + cte.getAllFirstLetter("西琉璃村委会"));
    }
}
// Reposted from: https://www.cnblogs.com/li-xy/p/5739831.html
// Related resource: WebMagic crawler that scrapes all article titles from a blog