import java.io.*;

import java.net.HttpURLConnection;

import java.net.MalformedURLException;

import java.net.URL;

import java.net.URLConnection;

import java.text.SimpleDateFormat;

import java.util.*;



import org.jdom.*;

import org.jdom.xpath.*;

import org.jdom.input.*;



import com.e104.util.*;

import com.e104.util.E104Mail;

import com.e104.db.*;



/**

* @author kevin.huang

* @email kevin.huang@104.com.tw

* @description 解析xml檔程式

* @date 2006/9/18

* @description 傳送Email、寫log檔、搜詢資料庫

* @date 2006/9/19

* @description 連接網頁內容取回部份內容值

* @date 2006/9/20

* @description 解析yahoo網頁完成

* @date 2006/9/21

* @description 資料寫入資料庫成功

* @date 2006/9/28

* @description 增加防錯測試

* @date 2006/10/1

*/



/**
 * Command-line job that reads keywords from the info_keylog table, fetches
 * Yahoo Taiwan search result pages for each keyword, extracts the sponsored
 * entries through {@code yahoohtmlparse}, removes duplicate links and inserts
 * the remaining rows into the over_monitor table. Progress is written to the
 * console, to a time-stamped log file and to a mail body that is sent from the
 * finally block.
 *
 * NOTE(review): several for-loop headers in this class end at "for (int x=0;x"
 * - the rest of the header appears to have been lost (probably everything from
 * the '<' onward). The class does not compile as shown; the original bounds
 * must be restored from version control rather than guessed.
 */
public class Yahoo

{



/**
 * Entry point.
 *
 * @param args args[0] is parsed as an integer into countermax; that value is
 *             only referenced by a commented-out SQL clause
 * @throws Exception declared by the signature; exceptions raised inside the
 *                   try block are caught and reported, while failures before
 *                   the try block or inside the finally block propagate
 */
public static void main(String[] args) throws Exception

{

String log = "";



// holds the de-duplicated data that will be written to the database

String clinks[] = new String[100];

String ctitles[] = new String[100];

String carticles[] = new String[100];



global g = new global();

local l = new local();



// create the log directory (hard-coded, machine-specific path)

File myDir = new File("C:\\Documents and Settings\\kevin.huang\\My Documents\\workspace\\testperl\\log");

myDir.mkdir();

System.out.println(myDir+(myDir.isDirectory()?" is":" is not")+" a directory.");

// create the log file; the name carries a yyyyMMddHHmmss time stamp

String dirName = "C:\\Documents and Settings\\kevin.huang\\My Documents\\workspace\\testperl\\log";

String fileName = "yahoo_" + new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + ".log";

File output = new File (dirName,fileName);

output.createNewFile();

//System.out.println(output.getPath());

FileWriter fw = new FileWriter(output.getPath(),true);

PrintWriter pw = new PrintWriter(fw);







E104Mail logmail = new E104Mail(); // mail object used to send the run report



// database setup

// used for the keyword query

E104Conn queryInfoDB = null; // connection used by this query

E104Data dataInfo = null; // result container for this query

String sqlCommand="";



// used for the insert

E104Conn wqueryInfoDB = null; // connection used by this query

E104Data wdataInfo = null; // result container for this query

String wsqlCommand="";



// used to look up the current maximum auto_no

E104Conn fqueryInfoDB = null; // connection used by this query

E104Data fdataInfo = null; // result container for this query

String fsqlCommand="";



try

{

// parse the XML configuration files

log = g.startparse();

l.startparse();

System.out.println("logpath="+log);



// write the start banner to the log file

pw.append("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");



// initialise the mail



l.MailHost="ex01.e104.com.tw"; // set the SMTP host; this overrides any MailHost value read by l.startparse()



logmail.setHost(l.MailHost);

logmail.setFrom(l.MailFrom);

logmail.setTo(l.MailTo);

//logmail.setBcc(l.MailCc);

logmail.setSubject("test yahoo search keyword...");



logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");



int countermax = Integer.parseInt(args[0]); // limit value; only used by the commented-out clause below

// the query condition must be changed before going live

sqlCommand = "SELECT autonumber, keyword, counter,apstart"+

" FROM info_keylog"+

" where "+

//" WHERE (keyword_group IS NOT NULL)"+

//" and "+

//" keyword_group <> '104boss'"+

//" counter < "+countermax+

//" and rownum< 41 "+

" apstart = 1"+

" ORDER BY counter DESC";



queryInfoDB = new E104Conn(g.driver,g.database,g.username,g.password,false);



System.out.println("連結INFO資料庫成功...");

logmail.appendBody("連結INFO資料庫成功...\n");

// write to the log file

pw.append("連結INFO資料庫成功...\n");



// fetch the keyword rows

System.out.println(sqlCommand);

logmail.appendBody("sqlCommand:\n"+sqlCommand+"\n");

pw.append("sqlCommand:\n"+sqlCommand+"\n");



queryInfoDB.setSql(sqlCommand);

dataInfo = queryInfoDB.getData();



System.out.println("共"+dataInfo.getRowCount()+"筆\n資料如下(autonumber,keyword,counter)");

logmail.appendBody("共"+dataInfo.getRowCount()+"筆\n資料如下(autonumber,keyword,counter)\n");

pw.append("共"+dataInfo.getRowCount()+"筆\n資料如下:\nautonumber keyword counter\n");



// print the column values of every returned row

if(dataInfo.getRowCount()!= 0)

// NOTE(review): truncated loop header - presumably iterates over the rows of dataInfo; confirm against the original source
for (int i=0;i
{

System.out.print(dataInfo.getCell(i,"autonumber")+" ");

System.out.print(dataInfo.getCell(i,"keyword")+" ");

System.out.println(dataInfo.getCell(i,"counter")); // read the column value



logmail.appendBody(dataInfo.getCell(i,"autonumber")+" ");

logmail.appendBody(dataInfo.getCell(i,"keyword")+" ");

logmail.appendBody(dataInfo.getCell(i,"counter")+"\n");



pw.append(dataInfo.getCell(i,"autonumber")+" ");

pw.append(dataInfo.getCell(i,"keyword")+" ");

pw.append(dataInfo.getCell(i,"counter")+"\n");



}



System.out.println("開始取得網頁內容:");

logmail.appendBody("開始取得網頁內容:\n");

pw.append("開始取得網頁內容:\n");



// configure the proxy through system properties
// NOTE(review): proxyHost is given with an "http://" prefix; proxy host properties normally take a bare host name - confirm

System.getProperties().put("proxySet","true");

System.getProperties().put("proxyHost","http://proxy.hinet.net");

System.getProperties().put("proxyPort","80");

System.out.println("proxySet:"+System.getProperties().getProperty("proxySet"));

System.out.println("proxyHost:"+System.getProperties().getProperty("proxyHost"));

System.out.println("proxyPort:"+System.getProperties().getProperty("proxyPort"));



// object that connects to the Yahoo result page

urlyahoo yahoo = new urlyahoo();

String keyword="";



yahoohtmlparse yahoohp = new yahoohtmlparse();



Charcode code = new Charcode();

//System.out.println("音樂:"+code.Utf8URLencode("音樂"));

String wkeyword = "";

int wcounter = 0;

// decide how many keywords to process (capped at 40)

int total=dataInfo.getRowCount();

if (total > 40)

total =40;



// NOTE(review): truncated loop header - presumably bounded by total; confirm against the original source
for (int i=0;i
{

yahoohp.init();

//keyword = dataInfo.getCell(i,"keyword");

for (int j=0; j<5;j++) // number of result pages fetched per keyword

{

// value of the "b" request parameter for this page: 1, 11, 21, ...
int page=j*10+1;

keyword = dataInfo.getCell(i,"keyword");

wcounter = Integer.parseInt(dataInfo.getCell(i,"counter"));

// keep the unencoded keyword for the insert statement
wkeyword = keyword;



keyword = code.Utf8URLencode(keyword);



System.out.println("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword);

logmail.appendBody("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");

pw.append("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");



// from here on "keyword" holds the full request URL
keyword="http://tw.search.yahoo.com/search?p="+keyword+"&ei=UTF-8&b="+page;



System.out.println("keyword:"+keyword);

logmail.appendBody("keyword:"+keyword+"\n");

pw.append("keyword:"+keyword+"\n");

yahoo.connect(keyword);

yahoo.readContents();



System.out.println("yahoo.wordcount:"+yahoo.wordcount);

logmail.appendBody("yahoo.wordcount:"+yahoo.wordcount+"\n");

pw.append("yahoo.wordcount:"+yahoo.wordcount+"\n");



// nothing was read: try the same URL once more
if (yahoo.wordcount == 0)

{

System.out.println("<----------開始重新連線---------->");

logmail.appendBody("<----------開始重新連線---------->\n");

pw.append("<----------開始重新連線---------->\n");



yahoo.connect(keyword);

yahoo.readContents();



}



System.out.println("開始搜尋第"+(j+1)+"頁");

logmail.appendBody("開始搜尋第"+(j+1)+"頁\n");

pw.append("開始搜尋第"+(j+1)+"頁\n");



// NOTE(review): yahoohtmlparse.html() calls init() on entry, which resets the counters filled by the previous page - confirm that results are meant to accumulate across the five pages
yahoohp.html(yahoo.htmlstr);



System.out.println("error:"+yahoohp.error);

logmail.appendBody("error:"+yahoohp.error+"\n");

pw.append("error:"+yahoohp.error+"\n");



// The else below binds to the inner if: when the parser reported an error
// and nothing was read, reconnect and parse again; when the parser reported
// an error but content was read, stop fetching further pages.
if (yahoohp.error == 1)

if (yahoo.wordcount == 0)

{

System.out.println("<----------開始重新連線---------->");

logmail.appendBody("<----------開始重新連線---------->\n");

pw.append("<----------開始重新連線---------->\n");



yahoo.connect(keyword);

yahoo.readContents();

yahoohp.html(yahoo.htmlstr);

}

else

{

break;

}



}







System.out.println("<----------找到的資料---------->");

logmail.appendBody("<----------找到的資料---------->\n");

pw.append("<----------找到的資料---------->\n");



// NOTE(review): truncated loop header - the body dumps the parsed links/titles/articles by index k
for (int k=0;k
{

System.out.println("links["+k+"]:"+yahoohp.links[k]);

System.out.println("titles["+k+"]:"+yahoohp.titles[k]);

System.out.println("articles["+k+"]:"+yahoohp.articles[k]);



logmail.appendBody("links["+k+"]:"+yahoohp.links[k]+"\n");

logmail.appendBody("titles["+k+"]:"+yahoohp.titles[k]+"\n");

logmail.appendBody("articles["+k+"]:"+yahoohp.articles[k]+"\n");



pw.append("links["+k+"]:"+yahoohp.links[k]+"\n");

pw.append("titles["+k+"]:"+yahoohp.titles[k]+"\n");

pw.append("articles["+k+"]:"+yahoohp.articles[k]+"\n");



}

// entries per page

int kp =0;

// NOTE(review): truncated loop header - the body only prints a 1-based page banner
for (int k=0;k
{

kp = k + 1;

System.out.println("<----------第["+kp+"]頁---------->");



logmail.appendBody("<----------第["+kp+"]頁---------->\n");



pw.append("<----------第["+kp+"]頁---------->\n");



}

//System.out.println(yahoo.htmlstr);



// select the wanted entries

int count=0;

for (int z=0;z<100;z++) // reset the result arrays

{

clinks[z]="";

ctitles[z]="";

carticles[z]="";

}



// drop duplicate entries: each parsed link is copied into the first empty
// slot of clinks unless an equal link is met first

// NOTE(review): truncated loop header - x indexes the parsed entries of yahoohp
for (int x=0;x
{

// NOTE(review): truncated loop header - y indexes the slots of clinks
for (int y=0;y
{

if (clinks[y].equals("")) // slot y is still empty

{

clinks[y]=yahoohp.links[x];

ctitles[y]=yahoohp.titles[x];

carticles[y]=yahoohp.articles[x];

count=count+1;



System.out.println("clinks["+y+"]"+clinks[y]);

logmail.appendBody("clinks["+y+"]"+clinks[y]+"\n");

pw.append("clinks["+y+"]"+clinks[y]+"\n");



System.out.println("ctitles["+y+"]"+ctitles[y]);

logmail.appendBody("ctitles["+y+"]"+ctitles[y]+"\n");

pw.append("ctitles["+y+"]"+ctitles[y]+"\n");



System.out.println("carticles["+y+"]"+carticles[y]);

logmail.appendBody("carticles["+y+"]"+carticles[y]+"\n");

pw.append("carticles["+y+"]"+carticles[y]+"\n");



break;

}

else // slot y already holds a link

{

if (yahoohp.links[x].equals(clinks[y])) // same link: duplicate, skip this entry

{

break;

}

else // different link: look at the next slot

{

continue;

}

}

}

}

System.out.println("count="+count);

logmail.appendBody("count="+count+"\n");

pw.append("count="+count+"\n");



System.out.println("<----------所需的資料---------->");

logmail.appendBody("<----------所需的資料---------->"+"\n");

pw.append("<----------所需的資料---------->"+"\n");



// sum: entries still to be assigned to a page; to: index of the next entry to write
int sum = count;

int to = 0;

// open the database connections
// NOTE(review): both connections are re-created for every keyword and their close() calls in the finally block are commented out

wqueryInfoDB = new E104Conn(g.driver,g.database,g.username,g.password,false);

fqueryInfoDB = new E104Conn(g.driver,g.database,g.username,g.password,false);



System.out.println("連結INFO資料庫成功...");

logmail.appendBody("連結INFO資料庫成功...\n");

pw.append("連結INFO資料庫成功...\n");



// write the entries, page by page

for (int p=0;p<5;p++) // five pages in total

{

int pcount =0; // 1-based page number

int change =0; // entries written so far for this page

pcount=p+1;



System.out.println("第"+pcount+"頁");

logmail.appendBody("第"+pcount+"頁\n");

pw.append("第"+pcount+"頁\n");

if (sum == -1) // sum >= 0 means entries remain, -1 means finished

break;

if (sum > 0)

{

sum = sum - yahoohp.page[p];

if (sum <= 0)

sum = 0;

}



System.out.println("sum="+sum);

logmail.appendBody("sum="+sum+"\n");

pw.append("sum="+sum+"\n");





// equivalent to (sum != -1): every value that is >= 0 is also != -1
if (sum >=0 || sum != -1)

{

// NOTE(review): truncated loop header - c is the 0-based position within the page; the loop also ends through the two break statements below
for (int c=0;c
{

if (to == count)

{

System.out.println("結束");

logmail.appendBody("結束\n");

pw.append("結束\n");

break;

}



int slot = c+1; // position value stored in the slot column

System.out.println("筆數"+change);

logmail.appendBody("筆數"+change+"\n");

pw.append("筆數"+change+"\n");

if (change == yahoohp.page[p]) // all entries counted for this page have been written

{

System.out.println("換頁");

logmail.appendBody("換頁\n");

pw.append("換頁\n");

break;

}



System.out.println("clinks["+to+"]="+clinks[to]);

System.out.println("ctitles["+to+"]="+ctitles[to]);

System.out.println("carticles["+to+"]="+carticles[to]);



logmail.appendBody("\nclinks["+to+"]="+clinks[to]);

logmail.appendBody("\nctitles["+to+"]="+ctitles[to]);

logmail.appendBody("\ncarticles["+to+"]="+carticles[to]);



pw.append("\nclinks["+to+"]="+clinks[to]);

pw.append("\nctitles["+to+"]="+ctitles[to]);

pw.append("\ncarticles["+to+"]="+carticles[to]);





// look up the current maximum sequence number
// NOTE(review): max(auto_no)+1 is read and used in separate statements; concurrent writers could obtain the same value - confirm this job is the only writer

fsqlCommand =

"select max(auto_no) as mx from over_monitor";



System.out.println("Find:"+fsqlCommand);

//logmail.appendBody("Find:\n"+fsqlCommand+"\n");

//pw.append("Find:\n"+fsqlCommand+"\n");



fqueryInfoDB.setSql(fsqlCommand);

fdataInfo = fqueryInfoDB.getData();

//System.out.println(fdataInfo.getColumnValues(0));



System.out.println("auto_no值:"+fdataInfo.getColumnValues(0));

int max =Integer.parseInt(fdataInfo.getCell(0,"mx"))+1;



System.out.println("max:"+max);

logmail.appendBody("\nmax:"+max);

pw.append("\nmax:"+max);



// write the row to the database

//max=max+1;

System.out.println("max:"+max);

System.out.println("domain_name:"+clinks[to]);

System.out.println("keyword:"+wkeyword);

System.out.println("pagenum:"+pcount);

System.out.println("slot:"+slot);

System.out.println("adtitle:"+ctitles[to]);

System.out.println("adtext:"+carticles[to]);

System.out.println("engine:y");

System.out.println("create_date:syadate");

System.out.println("counter:"+wcounter);



// NOTE(review): the statement is built by concatenating scraped page text and keywords into SQL; a quote character in any value breaks the statement, and this is an injection risk - use bound parameters if E104Conn supports them
wsqlCommand =

"insert into over_monitor "+

" (auto_no,domain_name,keyword,pagenum,slot,adtitle,adtext,engine,create_date,counter)"+

" values("+max+" ,'"+clinks[to]+"' ,'"+wkeyword+"' ,"+pcount+" ,"+slot+" ,'"+ctitles[to]+"' ,'"+carticles[to]+"' ,'y' ,sysdate,"+wcounter+")";

to=to+1;

// log the statement

System.out.println("keyword:"+wkeyword);

logmail.appendBody("\nkeyword:"+wkeyword+"\n");

pw.append("\nkeyword:"+wkeyword+"\n");



System.out.println("write:"+wsqlCommand);

logmail.appendBody("write:\n"+wsqlCommand+"\n");

pw.append("write:\n"+wsqlCommand+"\n");

change=change+1;

//System.out.println("begin");

wqueryInfoDB.setSql(wsqlCommand);

//wqueryInfoDB.executeUpdate(wsqlCommand);

//System.out.println("setsql");

// the insert is issued through getData(); the returned container is not used
wdataInfo = wqueryInfoDB.getData();

//System.out.println("wdatainfo");

//System.out.println("共"+wdataInfo.getRowCount()+"筆");





}

if (sum == 0)

sum=-1;

}

}

System.out.println("<----------初始設定---------->");

logmail.appendBody("<----------初始設定---------->\n");

pw.append("<----------初始設定---------->\n");

// reset the per-keyword state before the next keyword
count =0;

yahoohp.init();

System.out.println("error:"+yahoohp.error);

logmail.appendBody("error:"+yahoohp.error+"\n");

pw.append("error:"+yahoohp.error+"\n");

}

//out.write("htmlstr:"+hp.htmlstr);

}

catch (Exception e)

{



// report the failure on the console, in the mail body and in the log file
System.out.println(e.getMessage());

logmail.appendBody(e.getMessage());

pw.append(e.getMessage());



System.out.println("系統有誤未完成,請通知INFO的SA!!");

System.out.println("Message="+e.getMessage());

System.out.println("Exception="+e.toString());



logmail.appendBody("系統有誤未完成,請通知INFO的SA!!\n");

logmail.appendBody("Message="+e.getMessage()+"\n");

logmail.appendBody("Exception="+e.toString()+"\n");



pw.append("系統有誤未完成,請通知INFO的SA!!\n");

pw.append("Message="+e.getMessage()+"\n");

pw.append("Exception="+e.toString()+"\n");



}

finally

{

// close connections and release resources

System.out.println("釋放所有資源!!");

logmail.appendBody("釋放所有資源!!\n");

pw.append("釋放所有資源!!\n");



System.out.println("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束");

logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束\n");

pw.append("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束\n");



// send the report mail

logmail.send();



dataInfo = null;

logmail = null;

g = null;

l = null;

log = null;

sqlCommand = null;

wsqlCommand = null;





// NOTE(review): queryInfoDB is still null when the try block failed before the connection was created (for example when args[0] is missing), so this call would throw NullPointerException and skip the two close() calls below
queryInfoDB.close();

//wqueryInfoDB.close();

//fqueryInfoDB.close();

pw.close();

fw.close();





}





}





}



//big5轉url編碼

/**
 * Character-set helpers: ISO-8859-1 / GB2312 re-interpretation and a small
 * UTF-8 percent-encoder/decoder used to build search URLs.
 *
 * The decoder only understands three-byte UTF-8 sequences whose first byte is
 * 0xE0-0xEF (written "%Ex%xx%xx"), which covers the CJK range of the Basic
 * Multilingual Plane.
 */
class Charcode {

    /** Charset name used for percent-encoding and decoding. */
    private static final String UTF8 = "UTF-8";

    /** Length of one percent-encoded three-byte sequence such as "%E4%B8%AD". */
    private static final int ESCAPE_LENGTH = 9;

    /**
     * Re-interprets the ISO-8859-1 bytes of {@code text} as GB2312.
     *
     * @param text text whose characters are really GB2312 bytes read as ISO-8859-1
     * @return the re-decoded text; if a charset is not supported, the string
     *         form of the exception (historic behaviour, kept for compatibility)
     */
    public String ISO2GB(String text) {
        String result = "";
        try {
            result = new String(text.getBytes("ISO-8859-1"), "GB2312");
        }
        catch (UnsupportedEncodingException ex) {
            result = ex.toString();
        }
        return result;
    }

    /**
     * Re-interprets the GB2312 bytes of {@code text} as ISO-8859-1.
     *
     * @param text text to convert
     * @return the converted text, or an empty string if a charset is not
     *         supported (the stack trace is printed in that case)
     */
    public String GB2ISO(String text) {
        String result = "";
        try {
            result = new String(text.getBytes("GB2312"), "ISO-8859-1");
        }
        catch (UnsupportedEncodingException ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * Percent-encodes every character above U+00FF as upper-case UTF-8 escapes.
     * Characters in the range U+0000-U+00FF are copied unchanged (historic
     * behaviour: reserved URL characters such as space, '&amp;' and '%' are NOT
     * escaped by this method).
     *
     * The text is walked by code point, so a supplementary character (a
     * surrogate pair) is encoded as one four-byte sequence instead of two
     * "%3F" placeholders.
     *
     * @param text text to encode; must not be null
     * @return the encoded text
     * @throws IllegalStateException if the JVM does not support UTF-8
     */
    public String Utf8URLencode(String text) {
        StringBuffer result = new StringBuffer();

        int i = 0;
        while (i < text.length()) {
            int codePoint = text.codePointAt(i);
            i += Character.charCount(codePoint);

            if (codePoint <= 255) {
                result.append((char) codePoint);
                continue;
            }

            byte[] encoded;
            try {
                encoded = new String(Character.toChars(codePoint)).getBytes(UTF8);
            } catch (UnsupportedEncodingException ex) {
                throw new IllegalStateException("UTF-8 is not supported by this JVM", ex);
            }

            for (int j = 0; j < encoded.length; j++) {
                int unsigned = encoded[j] & 0xFF;
                result.append('%');
                if (unsigned < 0x10) {
                    // keep every escape two hex digits wide
                    result.append('0');
                }
                result.append(Integer.toHexString(unsigned).toUpperCase());
            }
        }

        return result.toString();
    }

    /**
     * Decodes "%Ex%xx%xx" escapes (either letter case) back into characters.
     * Everything that is not a well-formed nine-character escape is copied to
     * the result unchanged, including its letter case and any truncated escape
     * at the end of the text.
     *
     * @param text text to decode; may be null
     * @return the decoded text, or an empty string for null or empty input
     * @throws IllegalStateException if the JVM does not support UTF-8
     */
    public String Utf8URLdecode(String text) {
        if (text == null || text.length() == 0) {
            return "";
        }

        StringBuffer result = new StringBuffer();
        int pos = 0;
        int p = indexOfEscapeStart(text, pos);

        while (p != -1) {
            result.append(text.substring(pos, p));

            boolean decoded = false;
            if (text.length() - p >= ESCAPE_LENGTH) {
                String chunk = text.substring(p, p + ESCAPE_LENGTH);
                if (Utf8codeCheck(chunk)) {
                    result.append(CodeToWord(chunk));
                    pos = p + ESCAPE_LENGTH;
                    decoded = true;
                }
            }
            if (!decoded) {
                // not a complete escape: keep the two characters literally and
                // resume right after them so a following escape is still found
                result.append(text.substring(p, p + 2));
                pos = p + 2;
            }

            p = indexOfEscapeStart(text, pos);
        }

        result.append(text.substring(pos));
        return result.toString();
    }

    /**
     * Finds the next "%e" or "%E" at or after {@code from}.
     *
     * @return the index of the '%', or -1 if there is none
     */
    private int indexOfEscapeStart(String text, int from) {
        int p = text.indexOf('%', from);
        while (p != -1) {
            if (p + 1 < text.length()) {
                char next = text.charAt(p + 1);
                if (next == 'e' || next == 'E') {
                    return p;
                }
            }
            p = text.indexOf('%', p + 1);
        }
        return -1;
    }

    /**
     * Converts one nine-character escape into the character it encodes.
     *
     * @param text candidate escape such as "%E4%B8%AD"
     * @return the decoded character(s), or {@code text} itself when it is not
     *         a well-formed escape
     * @throws IllegalStateException if the JVM does not support UTF-8
     */
    private String CodeToWord(String text) {
        if (!Utf8codeCheck(text)) {
            return text;
        }

        byte[] code = new byte[3];
        for (int k = 0; k < code.length; k++) {
            int start = 3 * k + 1;
            // the narrowing cast maps 0x80-0xFF onto the negative byte values
            code[k] = (byte) Integer.parseInt(text.substring(start, start + 2), 16);
        }

        try {
            return new String(code, UTF8);
        } catch (UnsupportedEncodingException ex) {
            throw new IllegalStateException("UTF-8 is not supported by this JVM", ex);
        }
    }

    /**
     * Tells whether {@code text} is exactly one escape of the form "%Ex%xx%xx":
     * nine characters, '%' at offsets 0, 3 and 6, hexadecimal digits elsewhere
     * and a leading digit of 'e' or 'E'.
     *
     * @param text candidate escape; may be null
     * @return true when the text matches that form
     */
    private boolean Utf8codeCheck(String text) {
        if (text == null || text.length() != ESCAPE_LENGTH) {
            return false;
        }
        for (int i = 0; i < ESCAPE_LENGTH; i++) {
            char c = text.charAt(i);
            if (i % 3 == 0) {
                if (c != '%') {
                    return false;
                }
            } else if (Character.digit(c, 16) == -1) {
                return false;
            }
        }
        char lead = text.charAt(1);
        return lead == 'e' || lead == 'E';
    }

    /**
     * Tells whether the first '%' in {@code text} starts a "%Ex%xx%xx" escape.
     *
     * @param text text to inspect; may be null
     * @return true when the nine characters starting at the first '%' form a
     *         valid escape
     */
    public boolean isUtf8Url(String text) {
        if (text == null) {
            return false;
        }
        int p = text.indexOf('%');
        // ">=" so that an escape ending exactly at the end of the text counts
        if (p != -1 && text.length() - p >= ESCAPE_LENGTH) {
            text = text.substring(p, p + ESCAPE_LENGTH);
        }
        return Utf8codeCheck(text);
    }
}







/**
 * Settings holder populated from global.xml (read from the working directory).
 */
class global
{
    // values filled in by startparse()
    String service = "";
    String logpath = "";
    int delay;
    String service_des = "";
    String name = "";
    String driver = "";
    String database = "";
    String username = "";
    String password = "";
    int apname;

    /**
     * Reads global.xml, visits every element selected by the XPath "//*" and
     * copies the text of the recognised elements into the matching fields.
     * The element name is echoed to standard output for every element.
     *
     * @return the value of the logpath element, or an empty string if the
     *         file has none
     * @throws Exception if the file cannot be read or parsed, or if a numeric
     *                   value (ap/@name, delay) is not a valid integer
     */
    public String startparse() throws Exception
    {
        Document document = new SAXBuilder().build(new File("global.xml"));
        Element rootElement = document.getRootElement();
        List elements = XPath.newInstance("//*").selectNodes(rootElement);

        for (Iterator it = elements.iterator(); it.hasNext();) {
            Element element = (Element) it.next();
            System.out.print(element.getName() + ":");

            String tag = element.getName();
            String value = element.getText();

            if (tag.equals("ap")) {
                // <ap name="..."> carries the numeric application id as an attribute
                System.out.print("name=" + element.getAttributeValue("name") + " ");
                apname = Integer.parseInt(element.getAttributeValue("name"));
            } else if (tag.equals("service")) {
                service = value;
            } else if (tag.equals("logpath")) {
                logpath = value;
                System.out.println(logpath);
            } else if (tag.equals("delay")) {
                delay = Integer.parseInt(element.getText());
            } else if (tag.equals("service_des")) {
                service_des = value;
            } else if (tag.equals("name")) {
                name = value;
            } else if (tag.equals("driver")) {
                driver = value;
            } else if (tag.equals("database")) {
                database = value;
            } else if (tag.equals("username")) {
                username = value;
            } else if (tag.equals("password")) {
                password = value;
            }
        }

        return logpath;
    }
}



/**
 * Settings holder populated from local.xml (read from the working directory).
 */
class local
{
    // values filled in by startparse(); the field name "GlobalPtah" matches the
    // element name looked up in the file and is kept for compatibility
    String GlobalPtah = "";
    String EdmPathFree = "";
    String EdmPathPay = "";
    String ServerPath = "";
    String MailHost = "";
    String MailFrom = "";
    String MailTo = "";
    String MailCc = "";
    int Sql;

    /**
     * Reads local.xml, visits every element selected by the XPath "//*" and
     * copies the text of the recognised elements into the matching fields.
     * The name and text of every element are echoed to standard output.
     *
     * Earlier versions tested "MailTo" twice, so MailCc was never filled, and
     * never looked for EdmPathPay; both elements are now read.
     *
     * @throws Exception if the file cannot be read or parsed, or if the Sql
     *                   element is not a valid integer
     */
    public void startparse() throws Exception
    {
        String filename = "local.xml"; // XML file name

        SAXBuilder builder = new SAXBuilder();
        Document doc = builder.build(new File(filename));
        Element root = doc.getRootElement();

        XPath xpath = XPath.newInstance("//*"); // select every element
        List list = xpath.selectNodes(root);

        for (Iterator iter = list.iterator(); iter.hasNext();) {
            Element item = (Element) iter.next();
            System.out.print(item.getName() + ":");

            String tname = item.getName();
            String vname = item.getText();

            if (tname.equals("GlobalPtah")) {
                GlobalPtah = vname;
            } else if (tname.equals("EdmPathFree")) {
                EdmPathFree = vname;
            } else if (tname.equals("EdmPathPay")) {
                EdmPathPay = vname;
            } else if (tname.equals("ServerPath")) {
                ServerPath = vname;
            } else if (tname.equals("MailHost")) {
                MailHost = vname;
            } else if (tname.equals("MailFrom")) {
                MailFrom = vname;
            } else if (tname.equals("MailTo")) {
                MailTo = vname;
            } else if (tname.equals("MailCc")) {
                MailCc = vname;
            } else if (tname.equals("Sql")) {
                // tolerate surrounding whitespace/newlines in the element text
                Sql = Integer.parseInt(vname.trim());
            }

            System.out.println(item.getText());
        }
    }
}



//連接yahoo網頁取得內容

/**
 * Fetches the content of one URL and exposes it through {@link #htmlstr} and
 * {@link #wordcount}. Typical use: {@code connect(url)} followed by
 * {@code readContents()}.
 */
class urlyahoo implements Runnable
{
    /** Pause after a successful read, in milliseconds, to space out requests. */
    private static final long PAUSE_AFTER_READ_MS = 5000;

    /** Content of the last page read, with line terminators removed. */
    String htmlstr = "";

    /** Number of characters read by the last readContents() call (line terminators excluded). */
    int wordcount = 0;

    /**
     * Connection prepared by the last successful connect() call, or null.
     * Per instance, so that a failed connect() cannot leave a connection from
     * an earlier call (or from another instance) to be read by mistake.
     */
    private URLConnection connection;

    /**
     * Prepares a connection to {@code urlString} with 3-second connect and read
     * time-outs. Errors are printed and leave the object without a connection,
     * in which case the next readContents() yields empty content.
     *
     * @param urlString absolute URL to open
     */
    public void connect( String urlString ) {
        connection = null;
        try {
            URLConnection opened = new URL(urlString).openConnection();
            opened.setConnectTimeout(3000);
            opened.setReadTimeout(3000);
            connection = opened;
        } catch (Exception e) {
            // covers MalformedURLException and IOException as well
            e.printStackTrace();
        }
    }

    /**
     * Reads the prepared connection line by line into {@link #htmlstr} and
     * counts the characters in {@link #wordcount}. Both fields are reset on
     * every call, so after a failed read they describe only what was read by
     * this call (nothing, or a partial page) and {@code wordcount == 0} means
     * no content was received. After a complete read the method pauses for
     * five seconds.
     *
     * NOTE(review): the stream is decoded with the platform default charset,
     * as before - confirm that this matches the encoding the server returns.
     */
    public void readContents() {
        htmlstr = "";
        wordcount = 0;

        if (connection == null) {
            return;
        }

        StringBuffer page = new StringBuffer();
        BufferedReader in = null;
        boolean completed = false;

        try {
            in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream()));

            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                page.append(inputLine);
                wordcount = wordcount + inputLine.length();
            }
            completed = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // keep whatever was read, even when the stream failed half-way
            htmlstr = page.toString();
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // the content has already been captured; nothing to recover
                }
            }
        }

        if (completed) {
            try {
                Thread.sleep(PAUSE_AFTER_READ_MS);
            } catch (InterruptedException e) {
                // preserve the interrupt for the caller instead of swallowing it
                Thread.currentThread().interrupt();
            }
        }
    }

    /** Not implemented; present only to satisfy {@link Runnable}. */
    public void run() {
    }
}



//yahoo網頁解析

/**
 * Extracts entries (link, title, article text) from a Yahoo search result page
 * held in a string. The "L" methods handle the left-hand block and the "R"
 * methods the right-hand block; results are stored in the links, titles and
 * articles arrays, and page[] records how many links were found per call.
 *
 * NOTE(review): many marker literals in this class are empty ("") or broken
 * across two lines. They look like HTML tag strings whose markup was stripped
 * when the file was extracted; the class does not compile as shown and the
 * real markers must be restored from the original source.
 */
class yahoohtmlparse

{

int htmlstart;

int htmlend;

String htmlstr;

String substrl,substrr;



int error = 0; // error flag; html() sets it to 1 when the left-hand start marker is not found



int lpages = 0;

int rpages = 0;

int page[] = new int[100];



int linkcount = 0;

int titlecount = 0;

int articlecount = 0;

String titles[] = new String[100];

String articles[] = new String[100];

String links[] = new String[100];



/**
 * Resets all scalar state and clears the result arrays.
 */
public void init()

{

htmlstart = 0;

htmlend = 0;

htmlstr ="";

substrl="";

substrr="";



lpages = 0;

rpages = 0;



error =0;



linkcount = 0;

titlecount = 0;

articlecount = 0;



// NOTE(review): truncated loop header - the body clears links/titles/articles/page at index i
for (int i=0;i
{

links[i]="";

titles[i]="";

articles[i]="";

page[i]=0;

}

}



// default state
// Resets a subset of the fields. Because of the void return type this is an
// ordinary method, not a constructor; nothing in this file calls it.

public void htmlparse()

{

htmlstart=0;

htmlend=0;

htmlstr="";



error =0;

linkcount=0;

titlecount=0;

articlecount=0;





}

// find the wanted part of the page
/**
 * Parses one result page. Calls init() first, so results from earlier calls
 * are discarded. When the left-hand start marker is found, the left block is
 * passed to Lsubstr and, if a right-hand start marker is found after it at a
 * position greater than 0, the right block is passed to Rsubstr. When the
 * left-hand start marker is missing, error is set to 1.
 *
 * String.substring throws StringIndexOutOfBoundsException when an end marker
 * is missing or precedes its start marker; that case is not handled here.
 *
 * @param str full page content
 * @return the extracted left block, or an empty string when it was not found
 */

public String html(String str)

{

int indexl = 0;

int endl = 0;

int indexr = 0;

int endr = 0;

init();

// markers that delimit the left (l) and right (r) blocks
// NOTE(review): etagl, stagr and etagr are damaged literals - restore the original marker text

String stagl="刊登贊助網站";

String etagl="
";

String stagr="
";

String etagr="
";

System.out.println("str:\n"+str);



// locate the markers

indexl = str.indexOf(stagl);

endl = str.indexOf(etagl);

indexr = str.indexOf(stagr,indexl+stagl.length());

endr = str.indexOf(etagr,indexr+stagr.length());



System.out.println("stagl:"+indexl);

System.out.println("etagl:"+endl);

System.out.println("stagr:"+indexr);

System.out.println("etagr:"+endr);



substrl="";



if (indexl != -1)

{

substrl = str.substring(indexl,endl);



System.out.println("substrl:\n"+substrl);

Lsubstr(substrl);



if (indexr > 0) // the right-hand block exists

{



substrr = str.substring(indexr,endr);

System.out.println("substrr\n"+substrr);



Rsubstr(substrr);

}

}

else

{

error =1;

}

htmlstr = substrl;





return htmlstr;

}

// handle the left-hand block: extract links, titles and articles from it; always returns true

public boolean Lsubstr(String str)

{



Llink(str);

Ltitle(str);

Larticle(str);

return true;

}

// handle the right-hand block: extract links, titles and articles from it; always returns true

public boolean Rsubstr(String str)

{



Rlink(str);

Rtitle(str);

Rarticle(str);

return true;

}

// handle L_title
// Stores the text between each start/end marker pair in titles[], advancing
// titlecount. Returns the input string unchanged.
// NOTE(review): the end marker and the replace() arguments are damaged literals.

public String Ltitle(String str)

{

String start="\">";

String end="";

String title="";

int index= 0;

int to = 0;



//titles[count] = title;

System.out.println("Ltitle:"+str);



index = str.indexOf(start,index);

to = str.indexOf(end,index);

// the loop condition is index > 0, so it stops on -1 and also skips a match at position 0
while (index >0 ) // extract each title and count it

{



System.out.println("index:"+index);

System.out.println("to:"+to);

title = str.substring(index+start.length(),to);

System.out.println("title:"+title);

title = title.replace("","");

title = title.replace("
","");

titles[titlecount] = title;

index += start.length();

index = str.indexOf(start,index);

to = str.indexOf(end,index);

titlecount++;

}



//System.out.println(count);



return str;

}

// handle L_link
// Stores the text between each start/end marker pair in links[], advancing
// linkcount, then records the number found by this call in page[lpages] and
// advances lpages. Returns the input string unchanged.
// NOTE(review): both marker literals are damaged.

public String Llink(String str)

{

String start="";

String end="
";

String link="";

int index= 0;

int to = 0;

int count = 0;



System.out.println("B-str:"+str);

index = str.indexOf(start);

to = str.indexOf(end,index);

while (index >0 ) // extract each link and count it

{



System.out.println("index:"+index);

System.out.println("to:"+to);

link = str.substring(index+start.length(),to);

System.out.println("link:"+link);



links[linkcount] = link;



index += start.length();

index = str.indexOf(start,index);

to = str.indexOf(end,index);

linkcount++;



count++;

}



page[lpages] = count;

lpages++;



System.out.println("linkcount:"+linkcount);



return str;

}



// handle L_article
// Stores the text between each start/end marker pair in articles[], advancing
// articlecount. Returns the input string unchanged.
// NOTE(review): the marker literals and the replace() arguments are damaged.

public String Larticle(String str)

{



String start="
";

String end="";

String article="";



int index= 0;

int to = 0;



index = str.indexOf(start);

System.out.println("B-str:"+str);

index = str.indexOf(start);

to = str.indexOf(end,index);

while (index >0 ) // extract each article and count it

{



System.out.println("index:"+index);

System.out.println("to:"+to);

article = str.substring(index+start.length(),to);

System.out.println("article:"+article);

article = article.replace("","");

article = article.replace("
","");

articles[articlecount] = article;

index += start.length();

index = str.indexOf(start,index);

to = str.indexOf(end,index);

articlecount++;

}



//System.out.println(count);

return str;

}



// handle R_title
// Like Ltitle, but the search starts after the first occurrence of the
// "贊助網站" heading and stops when an extracted title equals "刊登贊助網站".
// Returns the input string unchanged.
// NOTE(review): the end marker and the replace() arguments are damaged literals.

public String Rtitle(String str)

{

String start="\">";

String end="";

String begin="贊助網站";



String title="";

int index= 0;

int to = 0;



//titles[count] = title;

System.out.println("Rtitle:"+str);

index = str.indexOf(begin);

//to = str.indexOf(bend);

//str = str.substring(index,to);

//System.out.println("RBtitle:"+str);

index += begin.length();

System.out.println("begin:"+index);

index = str.indexOf(start,index);



to = str.indexOf(end,index);

while (index >0 ) // extract each title and count it

{



System.out.println("index:"+index);

System.out.println("to:"+to);

title = str.substring(index+start.length(),to);



title = title.replace("","");

title = title.replace("
","");

System.out.println("title:"+title);

if (title.equals("刊登贊助網站")== true)

break;

titles[titlecount] = title;

index += start.length();

index = str.indexOf(start,index);



to = str.indexOf(end,index);



titlecount++;



}



//System.out.println(count);



return str;

}



// handle R_link
// Like Llink, but the number of links found is added to page[rpages] (instead
// of overwriting it) before rpages is advanced. Returns the input string unchanged.
// NOTE(review): both marker literals are damaged.

public String Rlink(String str)

{

String start="";

String end="
";

String link="";

int index= 0;

int to = 0;

int count = 0;



System.out.println("B-str:"+str);

index = str.indexOf(start);

to = str.indexOf(end,index);

while (index >0 ) // extract each link and count it

{



System.out.println("index:"+index);

System.out.println("to:"+to);

link = str.substring(index+start.length(),to);

System.out.println("link:"+link);



links[linkcount] = link;



index += start.length();

index = str.indexOf(start,index);

to = str.indexOf(end,index);

linkcount++;

count++;

}

page[rpages] = page[rpages] +count;

rpages++;



//linkcount = count;

System.out.println("linkcount:"+linkcount);



return str;

}



// handle R_article
// Stores the text between each start/end marker pair in articles[], advancing
// articlecount. Returns the input string unchanged.
// NOTE(review): the marker literals and the replace() arguments are damaged.

public String Rarticle(String str)

{



String start="
";

String end="
";

String article="";

int index= 0;

int to = 0;



System.out.println("B-str:"+str);

index = str.indexOf(start);

to = str.indexOf(end,index);

while (index >0 ) // extract each article and count it

{



System.out.println("index:"+index);

System.out.println("to:"+to);

article = str.substring(index+start.length(),to);

System.out.println("article:"+article);

article = article.replace("","");

article = article.replace("
","");

articles[articlecount] = article;

index += start.length();

index = str.indexOf(start,index);

to = str.indexOf(end,index);

articlecount++;

}



//System.out.println(count);

return str;

}

}