import java.io.*;
import java.net.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.lang.String;
import com.e104.util.*;
import com.e104.db.*;
/**
 * Batch job (AP id 471) that looks up stored keywords on Yahoo Taiwan search and records the
 * results in the database. All of the work is done in {@code main}.
 *
 * NOTE(review): several lines in this copy of the file are cut off part-way through: the
 * "rownum" condition of the keyword query, every {@code for} header and one {@code if}. The
 * class does not compile until those lines are restored from the original source.
 *
 * Change history:
 *   2006/9/18  - XML file parsing program
 *   2006/9/19  - send e-mail, write the log file, query the database
 *   2006/9/20  - connect to the web page and retrieve part of its content
 *   2006/9/21  - Yahoo page parsing completed
 *   2006/9/28  - writing the data to the database works
 *   2006/10/1  - added error-prevention tests
 *   2006/11/28 - modified the query statement
 *   2006/11/30 - modified the XML syntax
 *
 * @Apid 471
 * @author kevin.huang
 * @email kevin.huang@104.com.tw
 */
public class Search_Yahoo
{
/**
 * Runs the whole job once.
 *
 * Steps, as far as the visible code shows:
 *   1. Load the local and global XML configuration and read the log directory from the
 *      global tag "apini.logpath".
 *   2. Create the log directory and a time-stamped log file. Every progress message is
 *      printed to the console, appended to the mail body and written to the log file.
 *   3. Select keyword rows from info_keylog (apstart = 1, ordered by searchdate_y) and
 *      process at most 40 of them.
 *   4. For each keyword, fetch result pages from tw.search.yahoo.com through urlyahoo and
 *      hand the HTML to yahoohtmlparse.
 *   5. Copy the parsed links/titles/articles into clinks/ctitles/carticles, skipping links
 *      that are already stored.
 *   6. For each stored result, insert a row into over_monitor unless a row with the same
 *      keyword, domain_name and engine 'y' already exists.
 *   7. When a keyword's results are used up, set info_keylog.searchdate_y to sysdate for
 *      that row.
 *   8. In the finally block, send the mail and close the connections and the log writers.
 *
 * @param args not read by the visible code (the only use is commented out)
 * @throws Exception if the set-up before the try block fails, or if the clean-up in the
 *         finally block fails
 */
public static void main(String[] args) throws Exception
{
// Log directory path; filled from the global configuration below.
String log = "";
// Hold the results that are actually needed (duplicates already removed)
String clinks[] = new String[100];
String ctitles[] = new String[100];
String carticles[] = new String[100];
String dirName = null;
String fileName = null;
FileWriter fw = null;
PrintWriter pw = null;
// XML parsing
//log = g.startparse();
//l.startparse();
// Load the XML configuration
E104XmlLocalHandler LocalXml = E104XmlLocalHandler.performParser();
// NOTE(review): the tag name "GlobalPtah" looks like a misspelling of "GlobalPath". It is a
// runtime key, so confirm against the local XML file before changing it.
E104XmlGlobalHandler GlobalXml = E104XmlGlobalHandler.performParser(471, LocalXml.getLocalTagValue("GlobalPtah"));
log = GlobalXml.getGlobalTagValue("apini.logpath");
System.out.println("logpath="+log);
// Create the log directory
// (mkdir() creates a single directory level only and its result is not checked here.)
File myDir = new File(log);
myDir.mkdir();
System.out.println(myDir+(myDir.isDirectory()?" is":" is not")+" a directory.");
// Create the log file; the name carries the start time of this run
dirName = log;
fileName = "Search_Yahoo_mapping_" + new SimpleDateFormat("yyyy.MM.dd-'T'HH.mm.ss").format(new Date()) + ".log";
File output = new File (dirName,fileName);
output.createNewFile();
//System.out.println(output.getPath());
// Open the log file in append mode
fw = new FileWriter(output.getPath(),true);
pw = new PrintWriter(fw);
E104Mail logmail = new E104Mail(); // mail object used to send the report
// Database set-up
// Used for the keyword query
E104Conn queryInfoDB = null; // connection that will be used
E104Data dataInfo = null; // container for the query result
String sqlCommand="";
// Used for writes
E104Conn wqueryInfoDB = null; // connection that will be used
E104Data wdataInfo = null; // container for the query result
String wsqlCommand="";
// Used for the existence check before writing to over_monitor
E104Conn swqueryInfoDB = null; // connection that will be used
E104Data swdataInfo = null; // container for the query result
String swsqlCommand="";
// NOTE(review): mindate is not used anywhere else in the visible code.
String mindate = null;
try
{
// Write the start message to the log file
pw.write("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");
// Initialise the mail
//l.MailHost="ex01.e104.com.tw"; // set the SMTP host
logmail.setHost(LocalXml.getLocalTagValue("MailHost"));
logmail.setFrom(LocalXml.getLocalTagValue("MailFrom"));
logmail.setTo(LocalXml.getLocalTagValue("MailTo"));
// NOTE(review): Cc is filled from the same "MailTo" tag as To - confirm this is intended.
logmail.setCc(LocalXml.getLocalTagValue("MailTo"));
logmail.setSubject("yahoo search keyword...");
logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");
System.out.println("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:");
//int countermax = Integer.parseInt(args[0]); // limiting condition
// Build the keyword query: rows with apstart = 1 are read from info_keylog, ordered by
// searchdate_y ascending.
// NOTE(review): the line that starts the "rownum" condition is cut off in this copy of the
// file, which leaves an unterminated string literal; the row limit must be restored from
// the original source.
// The statements that follow, down to the line that builds the search URL, then:
//   - open the INFO database connection and run the query;
//   - cap the number of rows to process at 40;
//   - print/log autonumber, keyword, counter and searchdate_y of each row;
//   - create the page fetcher (urlyahoo, with proxy host/port taken from the local
//     configuration), the HTML parser (yahoohtmlparse) and the URL encoder (Charcode);
//   - for each row call yahoohp.init() and, for each result page j, read the row's keyword
//     and counter, URL-encode the keyword and build the search URL with b = j*10+1.
// NOTE(review): the loop headers in that stretch are cut off as well, so their bounds
// cannot be read here. The short comments inside that stretch are kept verbatim from the
// source.
// The condition must be changed before going live
sqlCommand = "SELECT autonumber, keyword, counter,apstart, searchdate_y"+
" FROM info_keylog"+
" where "+
" rownum
" and apstart = 1 "+
" ORDER BY searchdate_y ASC";
queryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);
System.out.println("連結INFO資料庫成功...");
logmail.appendBody("連結INFO資料庫成功...\n");
//寫檔內容
pw.write("連結INFO資料庫成功...\n");
// 撈取資料
System.out.println(sqlCommand);
logmail.appendBody("sqlCommand:\n"+sqlCommand+"\n");
pw.write("sqlCommand:\n"+sqlCommand+"\n");
queryInfoDB.setSql(sqlCommand);
dataInfo = queryInfoDB.getData();
// 判斷跑幾筆
int total=dataInfo.getRowCount();
if (total > 40)
total =40;
System.out.println("共"+total+"筆\n資料如下(autonumber,keyword,counter searchdate_y)");
logmail.appendBody("共"+total+"筆\n資料如下(autonumber,keyword,counter searchdate_y)\n");
pw.write("共"+String.valueOf(total)+"筆\n資料如下:\nautonumber keyword counter searchdate_y\n");
// 顯示資料庫中的欄位值
if(dataInfo.getRowCount()!= 0)
for (int i=0;i
{
System.out.print(dataInfo.getCell(i,"autonumber")+" ");
System.out.print(dataInfo.getCell(i,"keyword")+" ");
System.out.print(dataInfo.getCell(i,"counter")+" "); // 取欄位值
System.out.println(dataInfo.getCell(i,"searchdate_y")); // 取欄位值
logmail.appendBody(dataInfo.getCell(i,"autonumber")+" ");
logmail.appendBody(dataInfo.getCell(i,"keyword")+" ");
logmail.appendBody(dataInfo.getCell(i,"counter")+" ");
logmail.appendBody(dataInfo.getCell(i,"searchdate_y")+"\n"); // 取欄位值
pw.write(dataInfo.getCell(i,"autonumber")+" ");
pw.write(dataInfo.getCell(i,"keyword")+" ");
pw.write(dataInfo.getCell(i,"counter")+" ");
pw.write(dataInfo.getCell(i,"searchdate_y")+"\n"); // 取欄位值
}
System.out.println("開始取得網頁內容:");
logmail.appendBody("開始取得網頁內容:\n");
pw.write("開始取得網頁內容:\n");
//System.out.println("proxySet:"+System.getProperties().getProperty("proxySet"));
//System.out.println("proxyHost:"+System.getProperties().getProperty("proxyHost"));
//System.out.println("proxyPort:"+System.getProperties().getProperty("proxyPort"));
// yahoo連接網頁
urlyahoo yahoo = new urlyahoo();
urlyahoo.proxyHost = LocalXml.getLocalTagValue("proxyHost");
urlyahoo.proxyPort = LocalXml.getLocalTagValue("proxyPort");
String keyword="";
yahoohtmlparse yahoohp = new yahoohtmlparse();
Charcode code = new Charcode();
// System.out.println("音樂:"+code.Utf8URLencode("音樂"));
String wkeyword = "";
int wcounter = 0;
int run = 0;
String search="";
for (int i=0;i
{
yahoohp.init();
for (int j=0; j
{
int page=j*10+1;
run = i;
keyword = dataInfo.getCell(i,"keyword");
wcounter = Integer.parseInt(dataInfo.getCell(i,"counter"));
wkeyword = keyword;
keyword = code.Utf8URLencode(keyword);
System.out.println("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword);
logmail.appendBody("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");
pw.write("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");
search="http://tw.search.yahoo.com/search?p="+keyword+"&ei=UTF-8&b="+page;
// Report the URL, then fetch the page. The page text is read from yahoo.htmlstr below
// (presumably filled by readContents() - confirm in urlyahoo).
System.out.println(search);
logmail.appendBody(search+"\n");
pw.write(search+"\n");
yahoo.connects(search);
yahoo.readContents();
System.out.println("yahoo.wordcount:"+yahoo.wordcount);
logmail.appendBody("yahoo.wordcount:"+yahoo.wordcount+"\n");
pw.write("yahoo.wordcount:"+yahoo.wordcount+"\n");
// Report which result page (1-based) is being parsed
System.out.println("開始搜尋第"+(j+1)+"頁");
logmail.appendBody("開始搜尋第"+(j+1)+"頁\n");
pw.write("開始搜尋第"+(j+1)+"頁\n");
//System.out.println("yahoo.htmlstr:"+yahoo.htmlstr);
// Hand the fetched HTML to the parser and report its error field
yahoohp.html(yahoo.htmlstr);
System.out.println("error:"+yahoohp.error);
logmail.appendBody("error:"+yahoohp.error+"\n");
pw.write("error:"+yahoohp.error+"\n");
}
//System.out.println("");
//logmail.appendBody("\n");
//pw.write("\n");
/*
for (int k=0;k
{
System.out.println("links["+k+"]:"+yahoohp.links[k]);
System.out.println("titles["+k+"]:"+yahoohp.titles[k]);
System.out.println("articles["+k+"]:"+yahoohp.articles[k]);
logmail.appendBody("links["+k+"]:"+yahoohp.links[k]+"\n");
logmail.appendBody("titles["+k+"]:"+yahoohp.titles[k]+"\n");
logmail.appendBody("articles["+k+"]:"+yahoohp.articles[k]+"\n");
pw.write("links["+k+"]:"+yahoohp.links[k]+"\n");
pw.write("titles["+k+"]:"+yahoohp.titles[k]+"\n");
pw.write("articles["+k+"]:"+yahoohp.articles[k]+"\n");
}
*/
// How many results per page
// NOTE(review): this loop's header is cut off; its body only stores k + 1 in kp, and kp is
// not read anywhere else in the visible code.
int kp =0;
for (int k=0;k
{
kp = k + 1;
//System.out.println("");
//logmail.appendBody("\n");
//pw.write("\n");
}
// System.out.println(yahoo.htmlstr);
// Determine the required strings
// count = number of distinct results copied into clinks/ctitles/carticles for this keyword
int count=0;
// Reset the result slots to "" (loop header cut off in this copy)
for (int z=0;z
{
clinks[z]="";
ctitles[z]="";
carticles[z]="";
}
// Remove duplicate entries: each parsed link x is stored in the first empty slot unless an
// equal link is found in an earlier slot (both loop headers are cut off in this copy)
for (int x=0;x
{
for (int y=0;y
{
if (clinks[y].equals("")) // when clinks[y] is ""
{
// First free slot reached without finding the link: store the result here
clinks[y]=yahoohp.links[x];
ctitles[y]=yahoohp.titles[x];
carticles[y]=yahoohp.articles[x];
count=count+1;
//System.out.println("clinks["+y+"]"+clinks[y]);
//logmail.appendBody("clinks["+y+"]"+clinks[y]+"\n");
//pw.write("clinks["+y+"]"+clinks[y]+"\n");
//System.out.println("ctitles["+y+"]"+ctitles[y]);
//logmail.appendBody("ctitles["+y+"]"+ctitles[y]+"\n");
//pw.write("ctitles["+y+"]"+ctitles[y]+"\n");
//System.out.println("carticles["+y+"]"+carticles[y]);
//logmail.appendBody("carticles["+y+"]"+carticles[y]+"\n");
//pw.write("carticles["+y+"]"+carticles[y]+"\n");
break;
}
else // when clinks[y] already holds a value
{
if (yahoohp.links[x].equals(clinks[y])) // same link: already stored, skip it
{
break;
}
else // different link: keep looking in the next slot
{
continue;
}
}
}
}
System.out.println("count="+count);
logmail.appendBody("count="+count+"\n");
pw.write("count="+count+"\n");
System.out.println("");
logmail.appendBody(""+"\n");
pw.write(""+"\n");
// sum = results not yet assigned to a page; to = index of the next stored result to write
int sum = count;
int to = 0;
// Set up the database connection
// NOTE(review): this runs once per keyword, and only the last instance is closed in the
// finally block; no close() for the earlier ones is visible here.
wqueryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);
// Process the data, one result page per iteration (loop header cut off in this copy)
for (int p=0;p
{
int pcount =0; // page number (1-based)
int change =0; // number of rows handled so far on this page
pcount=p+1;
if (sum == -1) // sum >= 0 means there is still data, -1 means finished
break;
if (sum > 0)
{
// Take this page's result count (yahoohp.page[p]) off the remaining total
sum = sum - yahoohp.page[p];
// NOTE(review): the condition on the next line is cut off in this copy.
if (sum
sum = 0;
}
System.out.println("第"+pcount+"頁");
logmail.appendBody("第"+pcount+"頁\n");
pw.write("第"+pcount+"頁\n");
System.out.println("sum="+sum);
logmail.appendBody("sum="+sum+"\n");
pw.write("sum="+sum+"\n");
// NOTE(review): this condition is equivalent to (sum != -1).
if (sum >=0 || sum != -1)
{
// One iteration per result slot on this page (loop header cut off in this copy)
for (int c=0;c
{
// Stop once every stored result has been handled
if (to == count)
{
System.out.println("結束");
logmail.appendBody("結束\n");
pw.write("結束\n");
break;
}
int slot = c+1; // ordering value stored in the database (1-based position on the page)
System.out.println("筆數"+change);
logmail.appendBody("筆數"+change+"\n");
pw.write("筆數"+change+"\n");
if (change == yahoohp.page[p]) // check whether the row count has reached the page's row count
{
System.out.println("換頁");
logmail.appendBody("換頁\n");
pw.write("換頁\n");
break;
}
System.out.println("clinks["+to+"]="+clinks[to]);
System.out.println("ctitles["+to+"]="+ctitles[to]);
System.out.println("carticles["+to+"]="+carticles[to]);
logmail.appendBody("\nclinks["+to+"]="+clinks[to]);
logmail.appendBody("\nctitles["+to+"]="+ctitles[to]);
logmail.appendBody("\ncarticles["+to+"]="+carticles[to]);
pw.write("\nclinks["+to+"]="+clinks[to]);
pw.write("\nctitles["+to+"]="+ctitles[to]);
pw.write("\ncarticles["+to+"]="+carticles[to]);
// Write to the database
// max=max+1;
/*
System.out.println("max:"+max);
System.out.println("domain_name:"+clinks[to]);
System.out.println("keyword:"+wkeyword);
System.out.println("pagenum:"+pcount);
System.out.println("slot:"+slot);
System.out.println("adtitle:"+ctitles[to]);
System.out.println("adtext:"+carticles[to]);
System.out.println("engine:y");
System.out.println("create_date:syadate");
System.out.println("counter:"+wcounter);
*/
// Check whether the row already exists; if it does, use the update approach...
// (the update branch further down is commented out, so existing rows are left as they are)
// NOTE(review): the keyword and the link come from the database / the fetched page and are
// concatenated straight into the SQL text; a single quote in either value breaks the
// statement and the pattern is open to SQL injection.
swsqlCommand = "SELECT auto_no, keyword,domain_name "+
" FROM over_monitor "+
" where "+
" keyword ='"+wkeyword+"' "+
" and domain_name ='"+clinks[to]+"' "+
" and engine = 'y'";
//" ORDER BY auto_no desc";
// NOTE(review): a new connection is opened for every result and no close() call for
// swqueryInfoDB is visible in this class.
swqueryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);
swqueryInfoDB.setSql(swsqlCommand);
swdataInfo = swqueryInfoDB.getData();
System.out.println("存在筆數:"+swdataInfo.getRowCount());
// Insert only when no matching row exists yet
if (swdataInfo.getRowCount() ==0)
{
wsqlCommand =
"insert into over_monitor "+
" (auto_no,domain_name,keyword,pagenum,slot,adtitle,adtext,engine,create_date,counter)"+
" values( seq_over_monitor.nextval ,'"+clinks[to]+"' ,'"+wkeyword+"' ,"+pcount+" ,"+slot+" ,'"+ctitles[to]+"' ,'"+carticles[to]+"' ,'y' ,sysdate,"+wcounter+")";
// Fetch the data (original wording; here the statement being run is the insert above -
// presumably getData() executes whatever SQL was set, confirm in E104Conn)
System.out.println("keyword:"+wkeyword);
logmail.appendBody("\nkeyword:"+wkeyword+"\n");
pw.write("\nkeyword:"+wkeyword+"\n");
System.out.println("write:"+wsqlCommand);
logmail.appendBody("write:\n"+wsqlCommand+"\n");
pw.write("write:\n"+wsqlCommand+"\n");
wqueryInfoDB.setSql(wsqlCommand);
wdataInfo = wqueryInfoDB.getData();
}
/*
else
{
wsqlCommand =
"update over_monitor"+
" set pagenum="+pcount+", "+
" slot ="+slot+", "+
" adtitle='"+ctitles[to]+"', "+
" adtext='"+carticles[to]+"', "+
" engine='y' ,"+
" create_date=sysdate ,"+
" counter="+wcounter+
" where auto_no="+swdataInfo.getCell(0,"auto_no");
}
*/
// Advance to the next stored result and count it against this page
to=to+1;
change=change+1;
}
// All results of this keyword are assigned: mark the keyword as searched now and set
// sum to -1 so that the page loop stops on its next iteration
if (sum == 0)
{
sum=-1;
wsqlCommand =
"update info_keylog "+
" set searchdate_y =sysdate "+
" where autonumber="+dataInfo.getCell(i,"autonumber");
System.out.println("info_keylog:"+wsqlCommand);
logmail.appendBody("info_keylog:\n"+wsqlCommand+"\n");
pw.write("info_keylog:\n"+wsqlCommand+"\n");
wqueryInfoDB.setSql(wsqlCommand);
wdataInfo = wqueryInfoDB.getData();
}
}
}
// End of this keyword: flush the log file and reinitialise the counter and the parser
pw.flush();
System.out.println("");
logmail.appendBody("\n");
pw.write("\n");
count =0;
yahoohp.init();
System.out.println("error:"+yahoohp.error);
logmail.appendBody("error:"+yahoohp.error+"\n");
pw.write("error:"+yahoohp.error+"\n");
}
// out.write("htmlstr:"+hp.htmlstr);
}
catch (Exception e)
{
// Report the failure on the console, in the mail body and in the log file
// NOTE(review): e.getMessage() may be null; passing null to pw.write(String) is expected
// to throw NullPointerException - confirm and guard if so.
System.out.println(e.getMessage());
logmail.appendBody(e.getMessage());
pw.write(e.getMessage());
System.out.println("系統有誤未完成,請通知INFO的SA!!");
System.out.println("Message="+e.getMessage());
System.out.println("Exception="+e.toString());
logmail.appendBody("系統有誤未完成,請通知INFO的SA!!\n");
logmail.appendBody("Message="+e.getMessage()+"\n");
logmail.appendBody("Exception="+e.toString()+"\n");
pw.write("系統有誤未完成,請通知INFO的SA!!\n");
pw.write("Message="+e.getMessage()+"\n");
pw.write("Exception="+e.toString()+"\n");
pw.flush();
}
finally
{
// Close the connections and release resources
System.out.println("釋放所有資源!!");
logmail.appendBody("釋放所有資源!!\n");
pw.write("釋放所有資源!!\n");
System.out.println("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束");
logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束\n");
pw.write("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date()).toString()+"結束\n");
// Send the mail
// NOTE(review): if send() throws, none of the close() calls below are reached.
logmail.send();
dataInfo = null;
logmail = null;
log = null;
sqlCommand = null;
wsqlCommand = null;
// NOTE(review): queryInfoDB and wqueryInfoDB start as null and are only assigned inside the
// try block; if an exception occurred before those assignments, the calls below throw
// NullPointerException and pw/fw are never closed.
queryInfoDB.close();
wqueryInfoDB.close();
pw.flush();
pw.close();
fw.close();
}
}
}
/**
 * Character-set conversion and UTF-8 URL escaping helpers.
 * (Original one-line note on this class: "Big5 to URL encoding".)
 *
 * The URL helpers only deal with escapes of three-byte UTF-8 sequences whose first byte is
 * 0xE0-0xEF, written as nine characters such as "%e9%9f%b3"; that is the form the decoder
 * and the validity checks look for.
 */
class Charcode {

    /** Length of one escaped three-byte UTF-8 sequence, e.g. "%e9%9f%b3". */
    private static final int ESCAPE_LENGTH = 9;

    /** Upper-case hexadecimal digits used when writing escapes. */
    private static final String HEX_DIGITS = "0123456789ABCDEF";

    /**
     * Re-interprets text that was decoded as ISO-8859-1 as GB2312.
     *
     * @param text text whose characters each stand for one raw byte
     * @return the bytes of {@code text} decoded as GB2312; if one of the charsets is not
     *         available, the exception's string form is returned instead
     *         (NOTE(review): returning the exception text as data looks unintended, but it is
     *         kept because callers may rely on it)
     */
    public String ISO2GB(String text) {
        String result = "";
        try {
            result = new String(text.getBytes("ISO-8859-1"), "GB2312");
        }
        catch (UnsupportedEncodingException ex) {
            result = ex.toString();
        }
        return result;
    }

    /**
     * Encodes text as GB2312 and exposes each resulting byte as one ISO-8859-1 character.
     *
     * @param text text to convert
     * @return the converted text, or "" when one of the charsets is not available (the
     *         stack trace is printed in that case)
     */
    public String GB2ISO(String text) {
        String result = "";
        try {
            result = new String(text.getBytes("GB2312"), "ISO-8859-1");
        }
        catch (UnsupportedEncodingException ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * Escapes every character above U+00FF as the "%XX" form of its UTF-8 bytes.
     *
     * Characters up to U+00FF are copied unchanged. NOTE(review): that includes spaces,
     * '&amp;', '%' and the Latin-1 letters, so the result is not a fully escaped URL
     * component; this is left as it was because callers build URLs from it.
     *
     * A surrogate pair is encoded as one code point (four UTF-8 bytes). Encoding the two
     * halves separately, as a plain char-by-char loop does, turns each half into "%3F".
     *
     * @param text text to escape; must not be null
     * @return the escaped text
     * @throws NullPointerException if {@code text} is null
     */
    public String Utf8URLencode(String text) {
        StringBuffer result = new StringBuffer();
        int length = text.length();
        int i = 0;
        while (i < length) {
            char c = text.charAt(i);
            if (c <= 255) {
                result.append(c);
                i++;
                continue;
            }
            // Keep both halves of a surrogate pair together so they encode as one code point.
            int units = 1;
            if (isHighSurrogate(c) && i + 1 < length && isLowSurrogate(text.charAt(i + 1))) {
                units = 2;
            }
            byte[] b = utf8Bytes(text.substring(i, i + units));
            for (int j = 0; j < b.length; j++) {
                int k = b[j] & 0xFF; // bytes are signed; map to 0..255
                result.append('%');
                result.append(HEX_DIGITS.charAt(k >> 4));
                result.append(HEX_DIGITS.charAt(k & 0x0F));
            }
            i += units;
        }
        return result.toString();
    }

    /**
     * Replaces every nine-character escape of a three-byte UTF-8 sequence ("%eX%XX%XX", in
     * either letter case) with the character it stands for.
     *
     * Everything else is returned unchanged: text outside escapes keeps its letter case,
     * malformed nine-character groups are copied verbatim, and an incomplete escape at the
     * end of the text is kept rather than dropped.
     *
     * @param text text to decode; may be null
     * @return the decoded text, or "" when {@code text} is null or empty
     */
    public String Utf8URLdecode(String text) {
        if (text == null || text.length() == 0) {
            return "";
        }
        StringBuffer result = new StringBuffer();
        int from = 0;
        int p = indexOfEscape(text, from);
        while (p != -1) {
            result.append(text.substring(from, p));
            from = p;
            if (text.length() - p < ESCAPE_LENGTH) {
                // Not enough characters left for a full escape: keep the tail as it is.
                break;
            }
            result.append(CodeToWord(text.substring(p, p + ESCAPE_LENGTH)));
            from = p + ESCAPE_LENGTH;
            p = indexOfEscape(text, from);
        }
        result.append(text.substring(from));
        return result.toString();
    }

    /**
     * Converts one nine-character escape into the character it encodes.
     *
     * @param text candidate escape, e.g. "%e9%9f%b3"
     * @return the decoded character, or {@code text} unchanged when it is not a well-formed
     *         escape (wrong layout or non-hexadecimal digits)
     */
    private String CodeToWord(String text) {
        if (text.length() < ESCAPE_LENGTH || !Utf8codeCheck(text)) {
            return text;
        }
        int b0 = hexPair(text, 1);
        int b1 = hexPair(text, 4);
        int b2 = hexPair(text, 7);
        if (b0 < 0 || b1 < 0 || b2 < 0) {
            return text;
        }
        byte[] code = new byte[] { (byte) b0, (byte) b1, (byte) b2 };
        try {
            return new String(code, "UTF-8");
        }
        catch (UnsupportedEncodingException ex) {
            // UTF-8 is always available; fall back to the undecoded text just in case.
            return text;
        }
    }

    /**
     * Checks the layout of an escape: the text must start with "%e" (either case) and
     * contain '%' at exactly the positions 0, 3 and 6 and nowhere else. The digits
     * themselves are not validated here.
     *
     * @param text candidate escape
     * @return true when the layout matches
     */
    private boolean Utf8codeCheck(String text){
        if (text.length() < 2 || text.charAt(0) != '%'
                || Character.toLowerCase(text.charAt(1)) != 'e') {
            return false;
        }
        int found = 0;
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == '%') {
                if (found >= 3 || i != found * 3) {
                    return false;
                }
                found++;
            }
        }
        return found == 3;
    }

    /**
     * Tells whether the text, from its first '%' on, starts with an escape of a three-byte
     * UTF-8 sequence.
     *
     * An escape that ends exactly at the end of the text is recognised as well; the earlier
     * length test ({@code > 9}) missed it unless the text started with the escape.
     *
     * @param text text to inspect; must not be null
     * @return true when such an escape is found
     */
    public boolean isUtf8Url(String text) {
        text = text.toLowerCase();
        int p = text.indexOf("%");
        if (p != -1 && text.length() - p >= ESCAPE_LENGTH) {
            text = text.substring(p, p + ESCAPE_LENGTH);
        }
        return Utf8codeCheck(text);
    }

    /** Returns the index of the next "%e"/"%E" at or after {@code from}, or -1. */
    private static int indexOfEscape(String text, int from) {
        for (int i = text.indexOf('%', from); i != -1; i = text.indexOf('%', i + 1)) {
            if (i + 1 < text.length()) {
                char next = text.charAt(i + 1);
                if (next == 'e' || next == 'E') {
                    return i;
                }
            }
        }
        return -1;
    }

    /** Reads two hexadecimal digits starting at {@code at}; returns -1 if either is invalid. */
    private static int hexPair(String text, int at) {
        int hi = Character.digit(text.charAt(at), 16);
        int lo = Character.digit(text.charAt(at + 1), 16);
        if (hi < 0 || lo < 0) {
            return -1;
        }
        return (hi << 4) | lo;
    }

    /** Returns the UTF-8 bytes of {@code s}; fails loudly instead of silently yielding nothing. */
    private static byte[] utf8Bytes(String s) {
        try {
            return s.getBytes("UTF-8");
        }
        catch (UnsupportedEncodingException ex) {
            throw new RuntimeException("UTF-8 is not supported by this JVM", ex);
        }
    }

    /** True for the first half of a surrogate pair (U+D800..U+DBFF). */
    private static boolean isHighSurrogate(char c) {
        return c >= '\uD800' && c <= '\uDBFF';
    }

    /** True for the second half of a surrogate pair (U+DC00..U+DFFF). */
    private static boolean isLowSurrogate(char c) {
        return c >= '\uDC00' && c <= '\uDFFF';
    }
}
//連接yahoo網頁取得內容
class urlyahoo implements Runnable
{
String htmlstr=""; //儲存網頁內容
int wordcount=0; //計算字數
//URLConnection URLConn;
//URLConnection URLConn;
HttpURLConnection http;
static String proxyHost= null;
static String proxyPort= null;
public void connects( String urlString ) {
try {
// 設proxy
/*
Properties systemProperties = System.getProperties();
systemProperties.put("proxySet","true");
systemProperties.put("http.proxyHost",proxyHost);
systemProperties.put("http.proxyPort",proxyPort);
Syst