import java.io.*;
import java.net.*;
import java.text.SimpleDateFormat;
import java.util.*;
import com.e104.util.*;
import com.e104.util.E104Mail;
import com.e104.db.*;
import com.e104.util.E104XmlGlobalHandler;
import com.e104.util.E104XmlLocalHandler;
/**
* Command-line job that reads keywords from the info_keylog table, requests the
* matching Google result pages and stores the parsed result links in the
* over_monitor table. Progress is reported on stdout, in a log file and in a
* mail that is sent when the run ends.
*
* Change history (translated from the original header):
*
* @Apid 472
* @author kevin.huang
* @email kevin.huang@104.com.tw
* @description XML file parsing program
* @date 2006/9/18
* @description send e-mail, write the log file, query the database
* @date 2006/9/19
* @description connect to a web page and retrieve part of its content
* @date 2006/9/20
* @description parsing of the Google page completed
* @date 2006/9/26
* @description writing the data into the database works
* @date 2006/9/28
* @description added error-prevention tests
* @date 2006/10/1
* @description modified the query syntax
* @date 2006/11/30
* @description modified the XML syntax
*/
public class Search_Google
{
// NOTE(review): these two fields are never read or written by the code in this
// class; main() works with its own local handler variables instead.
E104XmlLocalHandler localXML = null;
E104XmlGlobalHandler globalXML = null;
/**
* Runs one pass of the keyword scan.
*
* Steps, in the order the code performs them:
* 1. Load the local XML settings, then the global settings for id 472 from the
*    location stored under the local tag GlobalPtah (tag name spelled as in the code).
* 2. Create the directory named by apini.logpath and a time-stamped log file in it.
* 3. Prepare the report mail (host, sender and recipient come from the local settings).
* 4. Query info_keylog for rows with apstart = 1, ordered by searchdate_g ascending;
*    the row count used afterwards is capped at 40.
* 5. For each keyword, percent-encode it, request Google result pages
*    (start = page index * 10) through urlgoogle and hand the HTML to
*    googlehtmlparse; paging stops when the parser reports error == 1.
* 6. Copy the parsed links, titles and articles into clinks/ctitles/carticles,
*    skipping a link that equals one already copied.
* 7. For each copied link, insert a row into over_monitor unless a row with the
*    same keyword, domain_name and engine g already exists; when the remaining
*    count reaches zero, set info_keylog.searchdate_g to sysdate for that keyword.
*
* Every progress message is written three times: stdout, mail body, log file.
* Exceptions thrown inside the try block are caught and reported through the same
* three channels; the finally block sends the mail and closes the log writers.
*
* NOTE(review): in this copy of the file the rownum predicate of the SELECT and
* several for/if headers are cut off after the loop variable, so the exact loop
* bounds cannot be read from the code and the class does not compile as shown.
* NOTE(review): the SELECT/INSERT statements for over_monitor are built by string
* concatenation with values parsed from fetched pages; a quote character in a
* title or link would break the statement. Parameter binding should be
* considered if the E104Conn API offers it (not visible from here).
* NOTE(review): a new E104Conn is created for every existence check and the only
* close() calls in this method are commented out; confirm whether E104Conn
* releases its resources by itself.
*
* @param args not read by the active code (the only reference is commented out)
* @throws Exception declared because the setup statements before the try block
*         and the statements in the finally block are not guarded by a catch
*/
public static void main(String[] args) throws Exception
{
String log = null;
// Final result data for one keyword (duplicates already removed)
String clinks[] = new String[100];
String ctitles[] = new String[100];
String carticles[] = new String[100];
String dirName = null;
String fileName = null;
FileWriter fw = null;
PrintWriter pw = null;
// XML parsing
//log = g.startparse();
//l.startparse();
// Set up the XML configuration: local settings first, then the global settings
// whose location is read from the local file.
E104XmlLocalHandler LocalXml = E104XmlLocalHandler.performParser();
E104XmlGlobalHandler GlobalXml = E104XmlGlobalHandler.performParser(472, LocalXml.getLocalTagValue("GlobalPtah"));
log = GlobalXml.getGlobalTagValue("apini.logpath");
System.out.println("logpath="+log);
// Create the log directory (mkdir creates a single level only; its result is
// not checked, the next line merely prints whether the path is a directory)
File myDir = new File(log);
myDir.mkdir();
System.out.println(myDir+(myDir.isDirectory()?" is":" is not")+" a directory.");
// Create the log file; the name carries the start time of this run
dirName = log;
fileName = "Search_Google_mapping_" + new SimpleDateFormat("yyyy.MM.dd-'T'HH.mm.ss").format(new Date()) + ".log";
File output = new File (dirName,fileName);
output.createNewFile();
//System.out.println(output.getPath());
// Opened in append mode
fw = new FileWriter(output.getPath(),true);
pw = new PrintWriter(fw);
E104Mail logmail = new E104Mail(); // mail object used to send the run report
// Initial database setup
// Used for the keyword query
E104Conn queryInfoDB = null; // connection used for this purpose
E104Data dataInfo = null; // container for the query result
String sqlCommand="";
// Used for writes
E104Conn wqueryInfoDB = null; // connection used for this purpose
E104Data wdataInfo = null; // container for the query result
String wsqlCommand="";
// Used for the existence check before writing to over_monitor
E104Conn swqueryInfoDB = null; // connection used for this purpose
E104Data swdataInfo = null; // container for the query result
String swsqlCommand="";
// NOTE(review): mindate is never used in this method
String mindate = null;
try
{
// Write the start banner to the log file
pw.write("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");
// Initialise the mail
//l.MailHost="ex01.e104.com.tw"; // set the SMTP host
logmail.setHost(LocalXml.getLocalTagValue("MailHost"));
logmail.setFrom(LocalXml.getLocalTagValue("MailFrom"));
logmail.setTo(LocalXml.getLocalTagValue("MailTo"));
//logmail.setCc(LocalXml.getLocalTagValue("MailCc"));
logmail.setSubject("google search keyword...");
logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");
//int countermax = Integer.parseInt(args[0]); // limit condition
// The condition must be changed before going live
// Keyword query: rows with apstart = 1, least recently searched first
sqlCommand = "SELECT autonumber, keyword, counter,apstart, searchdate_g"+
" FROM info_keylog"+
" where "+
" rownum
" and apstart = 1 "+
" ORDER BY searchdate_g ASC";
//driver = GlobalXml.getGlobalTagValue("dsn1.driver");
//System.out.println("driver:"+driver);
queryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);
//System.out.println("連結INFO資料庫成功...");
//logmail.appendBody("連結INFO資料庫成功...\n");
//寫檔內容
//pw.write("連結INFO資料庫成功...\n");
//撈取資料
System.out.println(sqlCommand);
logmail.appendBody("sqlCommand:\n"+sqlCommand+"\n");
pw.write("sqlCommand:\n"+sqlCommand+"\n");
queryInfoDB.setSql(sqlCommand);
dataInfo = queryInfoDB.getData();
//判斷跑幾筆
int total=dataInfo.getRowCount();
if (total > 40)
total =40;
System.out.println("共"+total+"筆\n資料如下(autonumber,keyword,counter searchdate_g)");
logmail.appendBody("共"+total+"筆\n資料如下(autonumber,keyword,counter searchdate_g)\n");
pw.write("共"+total+"筆\n資料如下:\nautonumber keyword counter searchdate_g\n");
// 顯示資料庫中的欄位值
if(dataInfo.getRowCount()!= 0)
for (int i=0;i
{
System.out.print(dataInfo.getCell(i,"autonumber")+" ");
System.out.print(dataInfo.getCell(i,"keyword")+" ");
System.out.print(dataInfo.getCell(i,"counter")+" "); // 取欄位值
System.out.println(dataInfo.getCell(i,"searchdate_g")); // 取欄位值
logmail.appendBody(dataInfo.getCell(i,"autonumber")+" ");
logmail.appendBody(dataInfo.getCell(i,"keyword")+" ");
logmail.appendBody(dataInfo.getCell(i,"counter")+" ");
logmail.appendBody(dataInfo.getCell(i,"searchdate_g")+"\n"); // 取欄位值
pw.write(dataInfo.getCell(i,"autonumber")+" ");
pw.write(dataInfo.getCell(i,"keyword")+" ");
pw.write(dataInfo.getCell(i,"counter")+" ");
pw.write(dataInfo.getCell(i,"searchdate_g")+"\n"); // 取欄位值
}
System.out.println("開始取得網頁內容:");
logmail.appendBody("開始取得網頁內容:\n");
pw.write("開始取得網頁內容:\n");
//google連接網頁
urlgoogle google = new urlgoogle();
urlgoogle.proxyHost = LocalXml.getLocalTagValue("proxyHost");
urlgoogle.proxyPort = LocalXml.getLocalTagValue("proxyPort");
String keyword="";
googlehtmlparse googlehp = new googlehtmlparse();
Charcode code = new Charcode();
//System.out.println("音樂:"+code.Utf8URLencode("音樂"));
String wkeyword = "";
int wcounter = 0;
for (int i=0;i
{
googlehp.init();
//keyword = dataInfo.getCell(i,"keyword");
for(int j=0; j
{
int page=j*10;
keyword = dataInfo.getCell(i,"keyword");
wcounter = Integer.parseInt(dataInfo.getCell(i,"counter"));
wkeyword = keyword;
keyword = code.Utf8URLencode(keyword);
System.out.println("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword);
logmail.appendBody("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");
pw.write("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");
keyword="http://www.google.com.tw/search?hl=zh-TW&q="+keyword+"&start="+page;
System.out.println("keyword:"+keyword);
logmail.appendBody("keyword:"+keyword+"\n");
pw.write("keyword:"+keyword+"\n");
google.connect(keyword);
google.readContents();
System.out.println("google.wordcount:"+google.wordcount);
logmail.appendBody("google.wordcount:"+google.wordcount+"\n");
pw.write("google.wordcount:"+google.wordcount+"\n");
System.out.println("開始搜尋第"+(j+1)+"頁");
logmail.appendBody("開始搜尋第"+(j+1)+"頁\n");
pw.write("開始搜尋第"+(j+1)+"頁\n");
googlehp.html(google.htmlstr);
System.out.println("error:"+googlehp.error);
logmail.appendBody("error:"+googlehp.error+"\n");
pw.write("error:"+googlehp.error+"\n");
if (googlehp.error == 1)
break;
}
//System.out.println("");
//logmail.appendBody("\n");
//pw.write("\n");
/*
for (int k=0;k
{
System.out.println("links["+k+"]:"+googlehp.links[k]);
System.out.println("titles["+k+"]:"+googlehp.titles[k]);
System.out.println("articles["+k+"]:"+googlehp.articles[k]);
logmail.appendBody("links["+k+"]:"+googlehp.links[k]+"\n");
logmail.appendBody("titles["+k+"]:"+googlehp.titles[k]+"\n");
logmail.appendBody("articles["+k+"]:"+googlehp.articles[k]+"\n");
pw.write("links["+k+"]:"+googlehp.links[k]+"\n");
pw.write("titles["+k+"]:"+googlehp.titles[k]+"\n");
pw.write("articles["+k+"]:"+googlehp.articles[k]+"\n");
}
*/
//每頁多少筆
int kp =0;
for (int k=0;k
{
kp = k + 1;
//System.out.println("");
//logmail.appendBody("\n");
//pw.write("\n");
}
//判斷所需字串
int count=0;
for (int z=0;z
{
clinks[z]="";
ctitles[z]="";
carticles[z]="";
}
//排除重覆的資料
System.out.println("linkcount:"+googlehp.linkcount);
logmail.appendBody("linkcount:"+googlehp.linkcount+"\n");
pw.write("linkcount:"+googlehp.linkcount+"\n");
for (int x=0;x
{
for (int y=0;y
{
if (clinks[y].equals("")) //clinks[y]為""時
{
clinks[y]=googlehp.links[x];
ctitles[y]=googlehp.titles[x];
carticles[y]=googlehp.articles[x];
count=count+1;
//System.out.println("clinks["+y+"]"+clinks[y]);
//logmail.appendBody("clinks["+y+"]"+clinks[y]+"\n");
//pw.write("clinks["+y+"]"+clinks[y]+"\n");
//System.out.println("ctitles["+y+"]"+ctitles[y]);
//logmail.appendBody("ctitles["+y+"]"+ctitles[y]+"\n");
//pw.write("ctitles["+y+"]"+ctitles[y]+"\n");
//System.out.println("carticles["+y+"]"+carticles[y]);
//logmail.appendBody("carticles["+y+"]"+carticles[y]+"\n");
//pw.write("carticles["+y+"]"+carticles[y]+"\n");
break;
}
else //clinks[y]有值時
{
if (googlehp.links[x].equals(clinks[y])) //一樣
{
break;
}
else //不一樣
{
continue;
}
}
}
}
System.out.println("count="+count);
logmail.appendBody("count="+count+"\n");
pw.write("count="+count+"\n");
System.out.println("");
logmail.appendBody(""+"\n");
pw.write(""+"\n");
int sum = count;
int to = 0;
//設定資料庫連結
wqueryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);
//資料處理
for (int p=0;p
{
int pcount =0; //計算頁數值
int change =0; //計算目前筆數
pcount=p+1;
if (sum == -1) //sum>=0代表還有資料,-1表結束
break;
if (sum > 0)
{
sum = sum - googlehp.page[p];
if (sum
sum = 0;
}
System.out.println("第"+pcount+"頁");
logmail.appendBody("第"+pcount+"頁\n");
pw.write("第"+pcount+"頁\n");
System.out.println("sum="+sum);
logmail.appendBody("sum="+sum+"\n");
pw.write("sum="+sum+"\n");
if (sum >=0 || sum != -1)
{
for (int c=0;c
{
if (to == count)
{
System.out.println("結束");
logmail.appendBody("結束\n");
pw.write("結束\n");
break;
}
int slot = c+1; //資料庫排序值
System.out.println("筆數"+change);
logmail.appendBody("筆數"+change+"\n");
pw.write("筆數"+change+"\n");
if (change == googlehp.page[p]) //記算筆數是否為頁筆數
{
System.out.println("換頁");
logmail.appendBody("換頁\n");
pw.write("換頁\n");
break;
}
System.out.println("clinks["+to+"]="+clinks[to]);
System.out.println("ctitles["+to+"]="+ctitles[to]);
System.out.println("carticles["+to+"]="+carticles[to]);
logmail.appendBody("\nclinks["+to+"]="+clinks[to]);
logmail.appendBody("\nctitles["+to+"]="+ctitles[to]);
logmail.appendBody("\ncarticles["+to+"]="+carticles[to]);
pw.write("\nclinks["+to+"]="+clinks[to]);
pw.write("\nctitles["+to+"]="+ctitles[to]);
pw.write("\ncarticles["+to+"]="+carticles[to]);
//寫入資料庫
//max=max+1;
/*
System.out.println("max:"+max);
System.out.println("domain_name:"+clinks[to]);
System.out.println("keyword:"+wkeyword);
System.out.println("pagenum:"+pcount);
System.out.println("slot:"+slot);
System.out.println("adtitle:"+ctitles[to]);
System.out.println("adtext:"+carticles[to]);
System.out.println("engine:g");
System.out.println("create_date:syadate");
System.out.println("counter:"+wcounter);
*/
//判斷資料是否存在,若存在用更新的方法…
swsqlCommand = "SELECT auto_no, keyword,domain_name "+
" FROM over_monitor "+
" where "+
" keyword ='"+wkeyword+"' "+
" and domain_name ='"+clinks[to]+"' "+
" and engine = 'g'";
//" ORDER BY auto_no desc";
swqueryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);
swqueryInfoDB.setSql(swsqlCommand);
swdataInfo = swqueryInfoDB.getData();
System.out.println("存在筆數:"+swdataInfo.getRowCount());
if (swdataInfo.getRowCount() ==0)
{
wsqlCommand =
"insert into over_monitor "+
" (auto_no,domain_name,keyword,pagenum,slot,adtitle,adtext,engine,create_date,counter)"+
" values( seq_over_monitor.nextval ,'"+clinks[to]+"' ,'"+wkeyword+"' ,"+pcount+" ,"+slot+" ,'"+ctitles[to]+"' ,'"+carticles[to]+"' ,'g' ,sysdate,"+wcounter+")";
//撈取資料
System.out.println("keyword:"+wkeyword);
logmail.appendBody("keyword:"+wkeyword+"\n");
pw.write("keyword:"+wkeyword+"\n");
System.out.println("write:"+wsqlCommand);
logmail.appendBody("write:\n"+wsqlCommand+"\n");
pw.write("write:\n"+wsqlCommand+"\n");
wqueryInfoDB.setSql(wsqlCommand);
wdataInfo = wqueryInfoDB.getData();
}
/*
else
{
wsqlCommand =
"update over_monitor"+
" set pagenum="+pcount+", "+
" slot ="+slot+", "+
" adtitle='"+ctitles[to]+"', "+
" adtext='"+carticles[to]+"', "+
" engine='g' ,"+
" create_date=sysdate ,"+
" counter="+wcounter+
" where auto_no="+swdataInfo.getCell(0,"auto_no");
}
*/
to=to+1;
change=change+1;
}
if (sum == 0)
{
sum=-1;
wsqlCommand =
"update info_keylog "+
" set searchdate_g =sysdate "+
" where autonumber="+dataInfo.getCell(i,"autonumber");
System.out.println("info_keylog:"+wsqlCommand);
logmail.appendBody("info_keylog:\n"+wsqlCommand+"\n");
pw.write("info_keylog:\n"+wsqlCommand+"\n");
wqueryInfoDB.setSql(wsqlCommand);
wdataInfo = wqueryInfoDB.getData();
}
}
}
pw.flush();
System.out.println("");
logmail.appendBody("\n");
pw.write("\n");
count =0;
googlehp.init();
//System.out.println("error:"+googlehp.error);
//logmail.appendBody("error:"+googlehp.error+"\n");
//pw.write("error:"+googlehp.error+"\n");
}
//out.write("htmlstr:"+hp.htmlstr);
}
catch (Exception e)
{
System.out.println(e.getMessage());
System.out.println("系統有誤未完成,請通知INFO的SA!!");
System.out.println("Message="+e.getMessage());
System.out.println("Exception="+e.toString());
e.printStackTrace();
logmail.appendBody("系統有誤未完成,請通知INFO的SA!!\n");
logmail.appendBody("Message="+e.getMessage()+"\n");
logmail.appendBody("Exception="+e.toString()+"\n");
pw.write("系統有誤未完成,請通知INFO的SA!!\n");
pw.write("Message="+e.getMessage()+"\n");
pw.write("Exception="+e.toString()+"\n");
pw.flush();
}
finally
{
//關閉連線,釋放資源
System.out.println("釋放所有資源!!");
logmail.appendBody("釋放所有資源!!\n");
pw.write("釋放所有資源!!\n");
System.out.println("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束");
logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束\n");
pw.write("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束\n");
//寄信
logmail.send();
dataInfo = null;
logmail = null;
log = null;
sqlCommand = null;
wsqlCommand = null;
//queryInfoDB.close();
//wqueryInfoDB.close();
pw.flush();
pw.close();
fw.close();
}
}
}
// Character-set helpers: ISO-8859-1/GB2312 re-decoding and the UTF-8
// percent-encoding used to build search URLs.
class Charcode {
    /**
     * Re-decodes a string whose characters are really GB2312 bytes that were read as
     * ISO-8859-1.
     *
     * @param text text to convert; must not be null
     * @return the converted text
     */
    public String ISO2GB(String text) {
        String result = "";
        try {
            result = new String(text.getBytes("ISO-8859-1"), "GB2312");
        }
        catch (UnsupportedEncodingException ex) {
            // NOTE(review): unlike GB2ISO, this hands the exception text back as the
            // result. Kept as is because callers outside this file may rely on it.
            result = ex.toString();
        }
        return result;
    }

    /**
     * Encodes a string as GB2312 bytes and reads those bytes back as ISO-8859-1.
     *
     * @param text text to convert; must not be null
     * @return the converted text, or an empty string when a charset is unavailable
     */
    public String GB2ISO(String text) {
        String result = "";
        try {
            result = new String(text.getBytes("GB2312"), "ISO-8859-1");
        }
        catch (UnsupportedEncodingException ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * Percent-encodes every non-ASCII character of {@code text} as its UTF-8 bytes
     * (upper-case hex). ASCII characters are copied unchanged, so reserved characters
     * such as space or {@code &} are NOT escaped by this method.
     *
     * The text is walked by code point, so a character outside the Basic Multilingual
     * Plane (stored as a surrogate pair) becomes one four-byte sequence instead of two
     * encoded question marks.
     *
     * @param text text to encode; must not be null
     * @return the encoded text
     */
    public String Utf8URLencode(String text) {
        StringBuilder result = new StringBuilder(text.length());
        int i = 0;
        while (i < text.length()) {
            int codePoint = text.codePointAt(i);
            int width = Character.charCount(codePoint);
            if (codePoint < 0x80) {
                result.append((char) codePoint);
            } else {
                byte[] utf8 = utf8Bytes(text.substring(i, i + width));
                for (int j = 0; j < utf8.length; j++) {
                    // Mask to get the unsigned value; every byte written here is
                    // >= 0x3F, so the hex form always has two digits.
                    int unsigned = utf8[j] & 0xFF;
                    result.append('%').append(Integer.toHexString(unsigned).toUpperCase());
                }
            }
            i += width;
        }
        return result.toString();
    }

    /**
     * Decodes every three-byte UTF-8 escape of the form {@code %Ex%xx%xx} (hex digits
     * in either case) found in {@code text}. Everything else, including other percent
     * escapes, malformed or incomplete sequences, is copied through unchanged and
     * keeps its original letter case.
     *
     * @param text text to decode; may be null
     * @return the decoded text, or an empty string for null or empty input
     */
    public String Utf8URLdecode(String text) {
        if (text == null || text.length() == 0) {
            return "";
        }
        StringBuilder result = new StringBuilder(text.length());
        int i = 0;
        while (i < text.length()) {
            if (text.charAt(i) == '%' && i + 9 <= text.length()) {
                String candidate = text.substring(i, i + 9);
                if (Utf8codeCheck(candidate)) {
                    result.append(CodeToWord(candidate));
                    i += 9;
                    continue;
                }
            }
            result.append(text.charAt(i));
            i++;
        }
        return result.toString();
    }

    /**
     * Converts one nine-character escape ({@code %Ex%xx%xx}) to the character it
     * encodes.
     *
     * @param text candidate escape sequence
     * @return the decoded character, or {@code text} itself when it is not a valid escape
     */
    private String CodeToWord(String text) {
        if (!Utf8codeCheck(text)) {
            return text;
        }
        byte[] code = new byte[3];
        code[0] = (byte) byteAt(text, 1);
        code[1] = (byte) byteAt(text, 4);
        code[2] = (byte) byteAt(text, 7);
        try {
            return new String(code, "UTF-8");
        } catch (UnsupportedEncodingException ex) {
            throw new IllegalStateException("UTF-8 charset is not available", ex);
        }
    }

    /**
     * Tells whether {@code text} is exactly one three-byte UTF-8 escape: nine
     * characters, a percent sign at offsets 0, 3 and 6, a lead byte in E0..EF and two
     * continuation bytes in 80..BF.
     *
     * @param text candidate escape sequence; may be null
     * @return true when the text has that shape
     */
    private boolean Utf8codeCheck(String text) {
        if (text == null || text.length() != 9) {
            return false;
        }
        if (text.charAt(0) != '%' || text.charAt(3) != '%' || text.charAt(6) != '%') {
            return false;
        }
        int lead = byteAt(text, 1);
        return lead >= 0xE0 && lead <= 0xEF
                && isContinuation(byteAt(text, 4))
                && isContinuation(byteAt(text, 7));
    }

    /**
     * Tells whether the first percent sign in {@code text} starts a three-byte UTF-8
     * escape.
     *
     * @param text text to inspect; must not be null
     * @return true when such an escape starts at the first percent sign
     */
    public boolean isUtf8Url(String text) {
        int p = text.indexOf('%');
        // A full escape needs nine characters counted from the percent sign.
        if (p == -1 || text.length() - p < 9) {
            return false;
        }
        return Utf8codeCheck(text.substring(p, p + 9));
    }

    /** Returns the UTF-8 bytes of {@code s}. */
    private static byte[] utf8Bytes(String s) {
        try {
            return s.getBytes("UTF-8");
        } catch (UnsupportedEncodingException ex) {
            throw new IllegalStateException("UTF-8 charset is not available", ex);
        }
    }

    /** Returns the value of the two hex digits at {@code pos}, or -1 if either is not a hex digit. */
    private static int byteAt(String text, int pos) {
        int high = hexValue(text.charAt(pos));
        int low = hexValue(text.charAt(pos + 1));
        if (high < 0 || low < 0) {
            return -1;
        }
        return (high << 4) | low;
    }

    /** Returns the value of an ASCII hex digit, or -1 for any other character. */
    private static int hexValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        return -1;
    }

    /** Tells whether {@code value} is a UTF-8 continuation byte (binary 10xxxxxx). */
    private static boolean isContinuation(int value) {
        return value >= 0x80 && value <= 0xBF;
    }
}
//連接google網頁取得內容
class urlgoogle implements Runnable
{
String htmlstr=""; //儲存網頁內容
int wordcount=0; //計算字數
HttpURLConnection URLConn;
static String proxyHost= null;
static String proxyPort= null;
/**
 * Opens an HTTP GET connection to {@code urlString} through an HTTP proxy and keeps
 * it in {@code URLConn} for a later {@code readContents()} call.
 *
 * The proxy address is taken from the static {@code proxyHost} / {@code proxyPort}
 * settings when they are filled in; otherwise the built-in address
 * 172.16.254.1:3128 is used. An unparsable port also falls back to 3128.
 *
 * Failures are not propagated: the stack trace is printed and the method returns.
 *
 * @param urlString absolute URL of the page to request
 */
public void connect( String urlString ) {
    // Built-in proxy, used when nothing has been configured.
    String host = "172.16.254.1";
    int port = 3128;
    if (proxyHost != null && proxyHost.trim().length() > 0) {
        host = proxyHost.trim();
    }
    if (proxyPort != null && proxyPort.trim().length() > 0) {
        try {
            port = Integer.parseInt(proxyPort.trim());
        } catch (NumberFormatException e) {
            System.err.println("Invalid proxyPort '" + proxyPort + "', using " + port);
        }
    }
    try {
        InetSocketAddress proxyAddress = new InetSocketAddress(host, port);
        Proxy proxy = new Proxy(Proxy.Type.HTTP, proxyAddress);
        URL url = new URL(urlString);
        URLConn = (HttpURLConnection) url.openConnection(proxy);
        URLConn.setRequestProperty("User-agent","IE/6.0");
        URLConn.setDoOutput(false);
        URLConn.setDoInput(true);
        URLConn.setRequestMethod("GET");
        // NOTE(review): no connect/read timeout is set, so a stalled proxy blocks
        // this call indefinitely. If timeouts are added they must be set before
        // connect() to have any effect.
        URLConn.connect();
    } catch (Exception e) {
        // Best effort by design: covers a malformed URL, I/O errors and an
        // out-of-range port alike.
        e.printStackTrace();
    }
}
/**
 * Reads the response body of the connection opened by {@code connect()} into
 * {@code htmlstr} (line terminators are dropped) and adds the number of characters
 * read to {@code wordcount}. After a complete read the method pauses for five
 * seconds before returning.
 *
 * {@code htmlstr} is cleared first, so a failed request leaves whatever part of the
 * new page was read (possibly nothing) rather than the previous page.
 * {@code wordcount} is not reset here; it keeps growing across calls.
 *
 * Failures are not propagated: the stack trace is printed and the method returns.
 *
 * @throws Exception declared for compatibility with existing callers; the
 *         implementation handles its own exceptions
 */
public void readContents() throws Exception {
    htmlstr = "";
    if (URLConn == null) {
        System.err.println("readContents: no connection available, connect() did not succeed");
        return;
    }
    StringBuilder page = new StringBuilder();
    BufferedReader in = null;
    boolean complete = false;
    try {
        // NOTE(review): the response is decoded with the platform default charset;
        // confirm which charset the fetched pages actually use.
        in = new BufferedReader(new InputStreamReader(URLConn.getInputStream()));
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            page.append(inputLine);
            wordcount = wordcount + inputLine.length();
        }
        complete = true;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Keep what was read, even when the stream failed part-way.
        htmlstr = page.toString();
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    if (complete) {
        try {
            // Pause between requests; only after a successful read.
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            // Restore the flag so the owner of this thread can see the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}
public void run() {