import java.io.*;

import java.net.*;




import java.text.SimpleDateFormat;

import java.util.*;

import java.lang.String;




import com.e104.util.*;

import com.e104.db.*;


 


/**

 * Batch job that reads keywords from the info_keylog table, fetches Yahoo Taiwan

 * search-result pages for each keyword, and inserts the de-duplicated results into

 * the over_monitor table. Progress is echoed to stdout, a log file and an e-mail body.

 *

 * @Apid 471

 * @author kevin.huang

 * @email kevin.huang@104.com.tw

 * @description XML file parsing program

 * @date 2006/9/18

 * @description Send e-mail, write the log file, query the database

 * @date 2006/9/19

 * @description Connect to the web page and retrieve part of its content

 * @date 2006/9/20

 * @description Yahoo page parsing completed

 * @date 2006/9/21

 * @description Data successfully written to the database

 * @date 2006/9/28

 * @description Added error-guarding tests

 * @date 2006/10/1

 * @description Changed the query statement

 * @date 2006/11/28

 * @date 2006/11/30

 * @description Changed the XML syntax

 */


public class Search_Yahoo

{


 /**
  * Entry point of the batch job.
  *
  * Visible flow: load the local and global XML configuration, create a timestamped
  * log file, read rows from info_keylog, request five Yahoo Taiwan result pages per
  * keyword, de-duplicate the parsed links, insert the rows that are not yet in
  * over_monitor, and stamp info_keylog.searchdate_y. Every progress message goes to
  * stdout, the mail body and the log file; the mail is sent from the finally block.
  *
  * NOTE(review): several for-loop headers in this copy of the source are truncated
  * (the text after the loop variable is missing), so the method does not compile as
  * shown. Each affected loop is marked below; restore the bounds from the original file.
  *
  * @param args command-line arguments; not read by the active code
  * @throws Exception anything raised outside the try block (configuration loading,
  *         log-file creation) or from inside the finally block
  */
 public static void main(String[] args) throws Exception 

 {

  String log = "";

  

  // Holds the required result data (duplicates already removed)

  String clinks[] = new String[100];

  String ctitles[] = new String[100];

  String carticles[] = new String[100];

  


  

  String dirName = null;

   String fileName = null;

   FileWriter fw = null;

   PrintWriter pw = null;

   

  //  XML parsing

  //log = g.startparse();

  //l.startparse();


   // Load the XML configuration
   // NOTE(review): "GlobalPtah" looks like a misspelling of "GlobalPath", but it is a
   // runtime lookup key — it must match the tag name in the local XML file; verify
   // before changing it.

   E104XmlLocalHandler LocalXml = E104XmlLocalHandler.performParser();

   E104XmlGlobalHandler GlobalXml = E104XmlGlobalHandler.performParser(471, LocalXml.getLocalTagValue("GlobalPtah"));

         

  

        log = GlobalXml.getGlobalTagValue("apini.logpath");

        System.out.println("logpath="+log);

  

  

  // Create the log directory

  File myDir = new File(log);

  myDir.mkdir();

  System.out.println(myDir+(myDir.isDirectory()?" is":" is not")+" a directory.");

  

  // Create the log file (name carries the start timestamp)

  dirName = log;

  fileName = "Search_Yahoo_mapping_" + new SimpleDateFormat("yyyy.MM.dd-'T'HH.mm.ss").format(new Date()) + ".log";

  File output = new File (dirName,fileName);

  output.createNewFile();

  //System.out.println(output.getPath());

   fw = new FileWriter(output.getPath(),true);

   pw = new PrintWriter(fw);

  

  

  

  E104Mail logmail = new E104Mail();       // object used to send the report mail

  

  // Initial database setup

  // Used for the keyword query

  E104Conn queryInfoDB = null;             // connection that will be used

  E104Data dataInfo = null;                // query result container that will be used

  String sqlCommand="";

  

  // Used for writing

  E104Conn wqueryInfoDB = null;             // connection that will be used

  E104Data wdataInfo = null;                // query result container that will be used

  String wsqlCommand="";

  


  

  // Used for the existence check before writing to over_monitor

  E104Conn swqueryInfoDB = null;             // connection that will be used

  E104Data swdataInfo = null;                // query result container that will be used

  String swsqlCommand="";

  

  

  String mindate = null;

  try

  {

   

  

   // Write the start banner to the log file

   pw.write("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");

  

   // Initialize the mail

   

   //l.MailHost="ex01.e104.com.tw";  // set SMTP

   

   // NOTE(review): Cc is filled from the same "MailTo" tag as To — confirm whether a
   // separate Cc tag was intended.

    logmail.setHost(LocalXml.getLocalTagValue("MailHost"));

   logmail.setFrom(LocalXml.getLocalTagValue("MailFrom"));

   logmail.setTo(LocalXml.getLocalTagValue("MailTo"));

   logmail.setCc(LocalXml.getLocalTagValue("MailTo"));

   logmail.setSubject("yahoo search keyword...");

  

   logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");

   System.out.println("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:");

   

   //int countermax = Integer.parseInt(args[0]); // limiting condition

  

    

   // The condition must be changed before going live

   sqlCommand = "SELECT autonumber, keyword, counter,apstart, searchdate_y"+

       " FROM info_keylog"+

       " where "+

       " rownum< 41 "+

       " and apstart = 1 "+

       " ORDER BY searchdate_y ASC";

   

   queryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);

   

   System.out.println("連結INFO資料庫成功...");

   logmail.appendBody("連結INFO資料庫成功...\n");

   // Write to the log file

   pw.write("連結INFO資料庫成功...\n");

   

   // Fetch the data

   System.out.println(sqlCommand);

   logmail.appendBody("sqlCommand:\n"+sqlCommand+"\n");

   pw.write("sqlCommand:\n"+sqlCommand+"\n");

   

   queryInfoDB.setSql(sqlCommand);

   dataInfo = queryInfoDB.getData();

   

   // Decide how many rows to process (capped at 40)

   int total=dataInfo.getRowCount();

    if (total > 40)

     total =40;

    

   System.out.println("共"+total+"筆\n資料如下(autonumber,keyword,counter searchdate_y)");

   logmail.appendBody("共"+total+"筆\n資料如下(autonumber,keyword,counter  searchdate_y)\n");

   pw.write("共"+String.valueOf(total)+"筆\n資料如下:\nautonumber     keyword               counter  searchdate_y\n");

   

   // Display the column values read from the database
   // NOTE(review): the loop header below is truncated in this copy of the source
   // (everything after "i" is missing); presumably it ran over the fetched rows up
   // to total — restore from the original file.

   if(dataInfo.getRowCount()!= 0)

   for (int i=0;i
   {

    System.out.print(dataInfo.getCell(i,"autonumber")+"  ");

    System.out.print(dataInfo.getCell(i,"keyword")+"  ");

    System.out.print(dataInfo.getCell(i,"counter")+"  ");  // read the column value

    System.out.println(dataInfo.getCell(i,"searchdate_y"));  // read the column value

    

    logmail.appendBody(dataInfo.getCell(i,"autonumber")+"  ");

    logmail.appendBody(dataInfo.getCell(i,"keyword")+"  ");

    logmail.appendBody(dataInfo.getCell(i,"counter")+"   ");

    logmail.appendBody(dataInfo.getCell(i,"searchdate_y")+"\n");  // read the column value

    

    pw.write(dataInfo.getCell(i,"autonumber")+"             ");

    pw.write(dataInfo.getCell(i,"keyword")+"                    ");

    pw.write(dataInfo.getCell(i,"counter")+"  ");

    pw.write(dataInfo.getCell(i,"searchdate_y")+"\n");  // read the column value

    

   }

   

   System.out.println("開始取得網頁內容:");

   logmail.appendBody("開始取得網頁內容:\n");

      pw.write("開始取得網頁內容:\n");

     

   

     

      //System.out.println("proxySet:"+System.getProperties().getProperty("proxySet"));

      //System.out.println("proxyHost:"+System.getProperties().getProperty("proxyHost"));

      //System.out.println("proxyPort:"+System.getProperties().getProperty("proxyPort"));

     

      // Yahoo page connection; proxy settings come from the local XML

   urlyahoo yahoo = new urlyahoo();

   urlyahoo.proxyHost = LocalXml.getLocalTagValue("proxyHost");

   urlyahoo.proxyPort = LocalXml.getLocalTagValue("proxyPort");

   

   String keyword="";

   

   yahoohtmlparse yahoohp = new yahoohtmlparse();

   

   Charcode code = new Charcode();

   // System.out.println("音樂:"+code.Utf8URLencode("音樂"));

   String wkeyword = "";

   int wcounter = 0;

   int run = 0;

   String search="";

   // Main loop: one iteration per keyword row.
   // NOTE(review): the loop header below is truncated in this copy of the source;
   // presumably it ran over the fetched rows up to total — restore from the original file.

   for (int i=0;i
   {

    yahoohp.init();

    

    

    for (int j=0; j<5;j++)  // number of result pages to fetch

    {

     // Value for Yahoo's "b" query parameter: 1, 11, 21, 31, 41

     int page=j*10+1;

     run = i;     

       

    keyword = dataInfo.getCell(i,"keyword");

    wcounter = Integer.parseInt(dataInfo.getCell(i,"counter"));

    // Keep the raw keyword for the database; the encoded form is only used in the URL

    wkeyword = keyword;

    

    keyword = code.Utf8URLencode(keyword);

    

    System.out.println("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword);

    logmail.appendBody("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");

    pw.write("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");

    

    search="http://tw.search.yahoo.com/search?p="+keyword+"&ei=UTF-8&b="+page;

    

    System.out.println(search);

    logmail.appendBody(search+"\n");

    pw.write(search+"\n");

    

    yahoo.connects(search);

       yahoo.readContents();

      

       System.out.println("yahoo.wordcount:"+yahoo.wordcount);

       logmail.appendBody("yahoo.wordcount:"+yahoo.wordcount+"\n");

       pw.write("yahoo.wordcount:"+yahoo.wordcount+"\n");

      

            

       System.out.println("開始搜尋第"+(j+1)+"頁");

       logmail.appendBody("開始搜尋第"+(j+1)+"頁\n");

       pw.write("開始搜尋第"+(j+1)+"頁\n");

    

       //System.out.println("yahoo.htmlstr:"+yahoo.htmlstr);

       // Hand the fetched HTML to the parser

       yahoohp.html(yahoo.htmlstr);

      

       System.out.println("error:"+yahoohp.error);

       logmail.appendBody("error:"+yahoohp.error+"\n");

       pw.write("error:"+yahoohp.error+"\n");

      

        

    }

    

    

    //System.out.println("<----------找到的資料---------->");

    //logmail.appendBody("<----------找到的資料---------->\n");

    //pw.write("<----------找到的資料---------->\n");

    /*

       for (int k=0;k
       {

        System.out.println("links["+k+"]:"+yahoohp.links[k]);

        System.out.println("titles["+k+"]:"+yahoohp.titles[k]);

        System.out.println("articles["+k+"]:"+yahoohp.articles[k]);

        

        logmail.appendBody("links["+k+"]:"+yahoohp.links[k]+"\n");

        logmail.appendBody("titles["+k+"]:"+yahoohp.titles[k]+"\n");

        logmail.appendBody("articles["+k+"]:"+yahoohp.articles[k]+"\n");

        

        pw.write("links["+k+"]:"+yahoohp.links[k]+"\n");

        pw.write("titles["+k+"]:"+yahoohp.titles[k]+"\n");

        pw.write("articles["+k+"]:"+yahoohp.articles[k]+"\n");

        

       }

    */

    // Number of rows per page
    // NOTE(review): the loop header below is truncated in this copy of the source.
    // Its body only assigns kp, which is not read afterwards in the visible code.

    int kp =0;

       for (int k=0;k
       {

        kp = k + 1;

        //System.out.println("<----------第["+kp+"]頁---------->");

        

        //logmail.appendBody("<----------第["+kp+"]頁---------->\n");

        

        //pw.write("<----------第["+kp+"]頁---------->\n");

        

       }

       // System.out.println(yahoo.htmlstr);

    

    // Determine the required strings

    int count=0;

       for (int z=0;z<100;z++) // reset the result arrays to empty strings

       {

        clinks[z]="";

        ctitles[z]="";

        carticles[z]="";

       }

    

    // Remove duplicate entries: each parsed link is copied into the first empty
    // slot of clinks unless an equal link is found in an earlier slot.
    // NOTE(review): both loop headers below are truncated in this copy of the source;
    // the bounds for x (parsed results) and y (clinks slots) must be restored from
    // the original file.

      for (int x=0;x
      {

       for (int y=0;y
       {

        if (clinks[y].equals("")) // when clinks[y] is ""

        {

         clinks[y]=yahoohp.links[x];

         ctitles[y]=yahoohp.titles[x];

         carticles[y]=yahoohp.articles[x];

         count=count+1;

  

         //System.out.println("clinks["+y+"]"+clinks[y]);

         //logmail.appendBody("clinks["+y+"]"+clinks[y]+"\n");

         //pw.write("clinks["+y+"]"+clinks[y]+"\n");

         

         //System.out.println("ctitles["+y+"]"+ctitles[y]);

         //logmail.appendBody("ctitles["+y+"]"+ctitles[y]+"\n");

         //pw.write("ctitles["+y+"]"+ctitles[y]+"\n");

         

         //System.out.println("carticles["+y+"]"+carticles[y]);

         //logmail.appendBody("carticles["+y+"]"+carticles[y]+"\n");

         //pw.write("carticles["+y+"]"+carticles[y]+"\n");

         

         break;

        }

        else // when clinks[y] already holds a value

        {

         if (yahoohp.links[x].equals(clinks[y])) // same link: duplicate, skip it

         {

          break;

         }

         else // different link: keep scanning the slots

         {

          continue;

         }

        }

       }

      }

    System.out.println("count="+count);

    logmail.appendBody("count="+count+"\n");

    pw.write("count="+count+"\n");

    

    System.out.println("<----------所需的資料---------->");

    logmail.appendBody("<----------所需的資料---------->"+"\n");

    pw.write("<----------所需的資料---------->"+"\n");

    

    // sum: unique rows still to be assigned to a page; to: next index into clinks

    int sum = count;

    int to = 0;

    // Set up the database connection used for writing
    // NOTE(review): a new connection is created for every keyword, but only the last
    // one is closed in the finally block of the visible code.

    wqueryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);


    

    

    

    // Process the data

     for (int p=0;p<5;p++) // 5 pages in total

     {

      int pcount =0;      // page number (1-based)

      int change =0;      // rows handled so far on this page

      pcount=p+1;

      

      if (sum == -1)  // sum>=0 means data remains, -1 means finished

       break;

      if (sum > 0)

      {

       sum = sum - yahoohp.page[p];

       if (sum <= 0)

        sum = 0;

      }

      

      System.out.println("第"+pcount+"頁");

      logmail.appendBody("第"+pcount+"頁\n");

      pw.write("第"+pcount+"頁\n");

        

      

      System.out.println("sum="+sum);

      logmail.appendBody("sum="+sum+"\n");

      pw.write("sum="+sum+"\n");

      

      

       // NOTE(review): this condition reduces to sum != -1, which always holds here
       // because sum == -1 already left the loop above.

       if (sum >=0 || sum != -1)

       {

        // NOTE(review): the loop header below is truncated in this copy of the source;
        // the loop is left via the two break statements inside, but its own bound
        // must be restored from the original file.

        for (int c=0;c
        {

            if (to == count)

            {

             System.out.println("結束");

             logmail.appendBody("結束\n");

             pw.write("結束\n");

             break;

            }

          

         int slot = c+1;      // ordering value stored in the database

         System.out.println("筆數"+change);

         logmail.appendBody("筆數"+change+"\n");

         pw.write("筆數"+change+"\n");

            if (change == yahoohp.page[p])  // has the row count reached this page's row count?

            { 

             System.out.println("換頁");

             logmail.appendBody("換頁\n");

             pw.write("換頁\n");

             break;

            }

         

         System.out.println("clinks["+to+"]="+clinks[to]);

         System.out.println("ctitles["+to+"]="+ctitles[to]);

         System.out.println("carticles["+to+"]="+carticles[to]);

         

         logmail.appendBody("\nclinks["+to+"]="+clinks[to]);

         logmail.appendBody("\nctitles["+to+"]="+ctitles[to]);

         logmail.appendBody("\ncarticles["+to+"]="+carticles[to]);

         

         pw.write("\nclinks["+to+"]="+clinks[to]);

         pw.write("\nctitles["+to+"]="+ctitles[to]);

         pw.write("\ncarticles["+to+"]="+carticles[to]);

         

         

         

         

         // Write to the database

         // max=max+1;

         /*

         System.out.println("max:"+max);

         System.out.println("domain_name:"+clinks[to]);

         System.out.println("keyword:"+wkeyword);

         System.out.println("pagenum:"+pcount);

         System.out.println("slot:"+slot);

         System.out.println("adtitle:"+ctitles[to]);

         System.out.println("adtext:"+carticles[to]);

         System.out.println("engine:y");

         System.out.println("create_date:syadate");

         System.out.println("counter:"+wcounter);

         */

         

         

         // Check whether the row already exists; if it does, the update approach
         // (commented out further below) would be used...
         // NOTE(review): this and the INSERT below are built by string concatenation
         // from the keyword and from scraped page text; a value containing a single
         // quote breaks the statement and allows SQL injection. Whether E104Conn
         // offers bind parameters is not visible here — TODO confirm.

         swsqlCommand = "SELECT auto_no, keyword,domain_name "+

          " FROM over_monitor "+

          " where "+

          " keyword ='"+wkeyword+"' "+

          " and domain_name ='"+clinks[to]+"' "+

          " and engine = 'y'";

          //" ORDER BY auto_no desc";

      

         // NOTE(review): a new connection is opened for every result row and
         // swqueryInfoDB is never closed in the visible code.

         swqueryInfoDB = new E104Conn(GlobalXml.getGlobalTagValue("dsn1.driver"),GlobalXml.getGlobalTagValue("dsn1.database"),GlobalXml.getGlobalTagValue("dsn1.username"),GlobalXml.getGlobalTagValue("dsn1.password"),false);

         swqueryInfoDB.setSql(swsqlCommand);

         swdataInfo = swqueryInfoDB.getData();

         

         System.out.println("存在筆數:"+swdataInfo.getRowCount());

         

          // Insert only when no matching row exists

          if (swdataInfo.getRowCount() ==0)

          {

           

           wsqlCommand =

           "insert into over_monitor "+

           " (auto_no,domain_name,keyword,pagenum,slot,adtitle,adtext,engine,create_date,counter)"+

           " values( seq_over_monitor.nextval  ,'"+clinks[to]+"' ,'"+wkeyword+"' ,"+pcount+" ,"+slot+" ,'"+ctitles[to]+"' ,'"+carticles[to]+"' ,'y' ,sysdate,"+wcounter+")";

           

           // Log and execute the statement

           System.out.println("keyword:"+wkeyword);

           logmail.appendBody("\nkeyword:"+wkeyword+"\n");

           pw.write("\nkeyword:"+wkeyword+"\n");

           

           System.out.println("write:"+wsqlCommand);

           logmail.appendBody("write:\n"+wsqlCommand+"\n");

           pw.write("write:\n"+wsqlCommand+"\n");

           

           wqueryInfoDB.setSql(wsqlCommand);

           

           wdataInfo = wqueryInfoDB.getData();

          }

         

         

         /*

         else

         {

          wsqlCommand =

           "update over_monitor"+

           " set pagenum="+pcount+", "+

           " slot ="+slot+", "+

           " adtitle='"+ctitles[to]+"', "+

           " adtext='"+carticles[to]+"', "+

           " engine='y' ,"+

           " create_date=sysdate ,"+

           " counter="+wcounter+

           " where auto_no="+swdataInfo.getCell(0,"auto_no");

         }

         */

         to=to+1;

         

         change=change+1;

     

        }

        

         // All rows of this keyword are placed: mark it finished and stamp the
         // search date on its info_keylog row

         if (sum == 0)

         {

          sum=-1;

          wsqlCommand =

           "update info_keylog "+

           " set searchdate_y =sysdate "+

           " where autonumber="+dataInfo.getCell(i,"autonumber");

          

          System.out.println("info_keylog:"+wsqlCommand);

          logmail.appendBody("info_keylog:\n"+wsqlCommand+"\n");

          pw.write("info_keylog:\n"+wsqlCommand+"\n");

          wqueryInfoDB.setSql(wsqlCommand); 

          wdataInfo = wqueryInfoDB.getData();

         }

        

        

       }

     } 

    pw.flush();

    // Reset the per-keyword state before the next keyword

    System.out.println("<----------初始設定---------->");

    logmail.appendBody("<----------初始設定---------->\n");

    pw.write("<----------初始設定---------->\n");

    count =0;

    yahoohp.init();

    System.out.println("error:"+yahoohp.error);

    logmail.appendBody("error:"+yahoohp.error+"\n");

    pw.write("error:"+yahoohp.error+"\n");

    

   }

    // out.write("htmlstr:"+hp.htmlstr);

  }

  catch (Exception e)

  {

         

    // Report the failure on all three channels; the exception is not rethrown.
    // NOTE(review): e.getMessage() may be null, and pw.write(null) would then throw
    // NullPointerException inside this handler — confirm and guard.

    System.out.println(e.getMessage()); 

    logmail.appendBody(e.getMessage());

    pw.write(e.getMessage());

    

       System.out.println("系統有誤未完成,請通知INFO的SA!!");

       System.out.println("Message="+e.getMessage());

       System.out.println("Exception="+e.toString());

      

       logmail.appendBody("系統有誤未完成,請通知INFO的SA!!\n");

       logmail.appendBody("Message="+e.getMessage()+"\n");

       logmail.appendBody("Exception="+e.toString()+"\n");

      

    pw.write("系統有誤未完成,請通知INFO的SA!!\n");

    pw.write("Message="+e.getMessage()+"\n");

    pw.write("Exception="+e.toString()+"\n");

    pw.flush();

  }

  finally

  {

   // Close the connections and release resources

   System.out.println("釋放所有資源!!");

   logmail.appendBody("釋放所有資源!!\n");

   pw.write("釋放所有資源!!\n");

   

   System.out.println("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束");

   logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束\n");

   pw.write("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date()).toString()+"結束\n");

   

   // Send the report mail

   logmail.send();

   

   dataInfo = null;

      logmail = null;


   log = null;

   sqlCommand = null;

   wsqlCommand = null;

   

   

   // NOTE(review): queryInfoDB and wqueryInfoDB are still null here when the try
   // block failed before assigning them; close() then throws NullPointerException
   // and the log writers below are never closed.

   queryInfoDB.close();

   wqueryInfoDB.close();

   pw.flush();

   pw.close();

   fw.close();

   

   

  }

  

   

 }

 


}


// Character-set conversion (ISO-8859-1 <-> GB2312) and UTF-8 URL percent-encoding helpers

class Charcode {

   /** Upper-case hexadecimal digits used when emitting percent-escapes. */
   private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

   /** Length of one three-byte escape group such as {@code %E9%9F%B3}. */
   private static final int GROUP_LENGTH = 9;


   /**
    * Re-interprets a string whose characters are really GB2312 bytes that were
    * decoded as ISO-8859-1.
    *
    * NOTE(review): when the JVM lacks one of the charsets the exception text is
    * returned as if it were the converted value; kept for backward compatibility.
    *
    * @param text string to convert; must not be null
    * @return the converted string, or the exception description on failure
    */
   public String ISO2GB(String text) {
     String result = "";
     try {
       result = new String(text.getBytes("ISO-8859-1"), "GB2312");
     }
     catch (UnsupportedEncodingException ex) {
       result = ex.toString();
     }
     return result;
   }


   /**
    * Encodes a string as GB2312 bytes and exposes those bytes as ISO-8859-1 characters.
    *
    * @param text string to convert; must not be null
    * @return the converted string, or "" when a charset is unavailable (the stack
    *         trace is printed in that case)
    */
   public String GB2ISO(String text) {
     String result = "";
     try {
       result = new String(text.getBytes("GB2312"), "ISO-8859-1");
     }
     catch (UnsupportedEncodingException ex) {
       ex.printStackTrace();
     }
     return result;
   }


   /**
    * Percent-encodes every non-ASCII character of {@code text} as its UTF-8 bytes.
    *
    * ASCII characters (code 0-127) are copied through unchanged, exactly as before,
    * so reserved characters such as space, {@code &} or {@code %} are NOT escaped;
    * callers that put arbitrary text into a query string must handle those themselves.
    *
    * Fixes over the previous version: characters 128-255 are now escaped instead of
    * being emitted raw, surrogate pairs are encoded as one four-byte sequence instead
    * of two {@code %3F}, and an encoding failure is no longer silently swallowed.
    *
    * @param text text to encode; must not be null
    * @return the encoded text
    * @throws IllegalStateException if the JVM does not provide UTF-8 (never happens
    *         on a conforming JVM)
    */
   public String Utf8URLencode(String text) {
     StringBuffer result = new StringBuffer(text.length());
     int i = 0;
     while (i < text.length()) {
       char c = text.charAt(i);
       if (c < 128) {
         result.append(c);
         i++;
         continue;
       }

       // Take both halves of a surrogate pair together so they form one code point.
       int end = i + 1;
       if (Character.isHighSurrogate(c) && end < text.length()
           && Character.isLowSurrogate(text.charAt(end))) {
         end++;
       }

       byte[] bytes;
       try {
         bytes = text.substring(i, end).getBytes("UTF-8");
       } catch (UnsupportedEncodingException ex) {
         throw new IllegalStateException("UTF-8 charset is not available", ex);
       }
       for (int j = 0; j < bytes.length; j++) {
         int value = bytes[j] & 0xFF;
         result.append('%');
         result.append(HEX_DIGITS[value >> 4]);
         result.append(HEX_DIGITS[value & 0x0F]);
       }
       i = end;
     }
     return result.toString();
   }


   /**
    * Decodes three-byte UTF-8 escape groups ({@code %Ex%xx%xx}, either letter case)
    * found in {@code text}; everything else is copied through untouched.
    *
    * Fixes over the previous version: literal text keeps its original letter case
    * (the whole input used to be lower-cased), an incomplete trailing group is kept
    * instead of being dropped, a group with non-hex digits no longer throws
    * NumberFormatException, and null yields "" instead of the string "null".
    *
    * Only three-byte sequences are recognized, as before; two- and four-byte
    * escapes are left as they are.
    *
    * @param text text to decode; may be null
    * @return the decoded text, "" for null or empty input
    */
   public String Utf8URLdecode(String text) {
     if (text == null || text.length() == 0) {
       return "";
     }

     StringBuffer result = new StringBuffer(text.length());
     int i = 0;
     while (i < text.length()) {
       if (text.charAt(i) == '%' && i + GROUP_LENGTH <= text.length()) {
         String group = text.substring(i, i + GROUP_LENGTH);
         if (Utf8codeCheck(group)) {
           result.append(CodeToWord(group));
           i += GROUP_LENGTH;
           continue;
         }
       }
       result.append(text.charAt(i));
       i++;
     }
     return result.toString();
   }


   /**
    * Converts one escape group to the character it encodes.
    *
    * @param text a nine-character group such as {@code %e9%9f%b3}
    * @return the decoded character, or {@code text} itself when it is not a valid group
    */
   private String CodeToWord(String text) {
     if (!Utf8codeCheck(text)) {
       return text;
     }

     byte[] code = new byte[3];
     for (int i = 0; i < code.length; i++) {
       int start = i * 3 + 1; // skip the '%' in front of each hex pair
       code[i] = (byte) Integer.parseInt(text.substring(start, start + 2), 16);
     }
     try {
       return new String(code, "UTF-8");
     } catch (UnsupportedEncodingException ex) {
       // Cannot happen on a conforming JVM; keep the input rather than yield null.
       return text;
     }
   }


   /**
    * Checks whether {@code text} is exactly one three-byte UTF-8 escape group:
    * three {@code %hh} pairs, the first byte in the E0-EF range and the other two
    * in the continuation range 80-BF. Letter case is ignored.
    *
    * @param text candidate group
    * @return true when the group is well formed
    */
   private boolean Utf8codeCheck(String text) {
     if (text == null || text.length() != GROUP_LENGTH) {
       return false;
     }
     for (int i = 0; i < GROUP_LENGTH; i += 3) {
       if (text.charAt(i) != '%'
           || !isHexDigit(text.charAt(i + 1))
           || !isHexDigit(text.charAt(i + 2))) {
         return false;
       }
     }
     char lead = text.charAt(1);
     if (lead != 'e' && lead != 'E') {
       return false;
     }
     return isContinuationNibble(text.charAt(4)) && isContinuationNibble(text.charAt(7));
   }


   /** Returns true for the ASCII hexadecimal digits 0-9, a-f and A-F. */
   private static boolean isHexDigit(char c) {
     return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
   }


   /** Returns true when {@code c} is the high hex digit of a byte in 80-BF. */
   private static boolean isContinuationNibble(char c) {
     return c == '8' || c == '9' || c == 'a' || c == 'b' || c == 'A' || c == 'B';
   }


   /**
    * Tells whether the text starting at the first {@code %} is a three-byte UTF-8
    * escape group.
    *
    * Fix over the previous version: a group that ends exactly at the end of the
    * string (for example {@code abc%e9%9f%b3}) is now recognized; the old length
    * test was off by one. Null now yields false instead of NullPointerException.
    *
    * @param text text to inspect; may be null
    * @return true when a valid group starts at the first {@code %}
    */
   public boolean isUtf8Url(String text) {
     if (text == null) {
       return false;
     }
     int p = text.indexOf('%');
     if (p == -1 || text.length() - p < GROUP_LENGTH) {
       return false;
     }
     return Utf8codeCheck(text.substring(p, p + GROUP_LENGTH));
   }

}


 




// Connects to a Yahoo web page and retrieves its content

class urlyahoo implements Runnable  

{

 String htmlstr="";  //儲存網頁內容

    int wordcount=0;  //計算字數

   

    //URLConnection URLConn;

    //URLConnection URLConn;

    HttpURLConnection http;

    static String proxyHost= null;

    static String proxyPort= null;

 


    public void connects( String urlString ) {

      try {

      

        // 設proxy

       /*

          Properties systemProperties = System.getProperties();

          systemProperties.put("proxySet","true");

          systemProperties.put("http.proxyHost",proxyHost);

          systemProperties.put("http.proxyPort",proxyPort);

          Syst

arrow
arrow
    全站熱搜

    狼翔月影 發表在 痞客邦 留言(0) 人氣()