import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.*;
import java.net.*;
import org.jdom.*;
import org.jdom.xpath.*;
import org.jdom.input.*;
import com.e104.util.*;
import com.e104.util.E104Mail;
import com.e104.db.*;
/**
* Batch job that reads keywords from the info_keylog table, requests the Google
* search result pages for each keyword, hands the HTML to googlehtmlparse, and
* inserts the extracted link/title/text entries into the over_monitor table.
* Every step is echoed to stdout, to a timestamped log file and to an e-mail
* report that is sent when the run ends.
*
* NOTE(review): several loop and condition headers in this copy are cut off
* after the loop variable (for example "for (int i=0;i"); they look like the
* result of markup stripping and must be restored from the original source
* before this class can compile.
*
* @author kevin.huang
* @email kevin.huang@104.com.tw
* @description XML file parsing program
* @date 2006/9/18
* @description send e-mail, write the log file, query the database
* @date 2006/9/19
* @description connect to a web page and retrieve part of its content
* @date 2006/9/20
* @description Google page parsing completed
* @date 2006/9/26
* @description data successfully written to the database
* @date 2006/9/28
* @description added error-proofing tests
* @date 2006/10/1
*/
public class Google
{
/**
* Runs the whole job: set-up, keyword query, page fetching, de-duplication,
* database insert, and the final report/cleanup.
*
* @param args args[0] is parsed as an integer into countermax
* @throws Exception anything thrown outside the try/catch below (log file
* creation, or the cleanup in the finally block)
*/
public static void main(String[] args) throws Exception
{
String log = null;
// Holds the results to be stored (duplicates already removed)
String clinks[] = new String[100];
String ctitles[] = new String[100];
String carticles[] = new String[100];
global g = new global();
local l = new local();
// Create the log directory
// NOTE(review): the path is hard-coded to one developer's workstation.
File myDir = new File("C:\\Documents and Settings\\kevin.huang\\My Documents\\workspace\\google\\log");
myDir.mkdir();
System.out.println(myDir+(myDir.isDirectory()?" is":" is not")+" a directory.");
// Create the log file (name carries the start timestamp)
String dirName = "C:\\Documents and Settings\\kevin.huang\\My Documents\\workspace\\google\\log";
String fileName = "google_" + new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + ".log";
File output = new File (dirName,fileName);
output.createNewFile();
//System.out.println(output.getPath());
FileWriter fw = new FileWriter(output.getPath(),true);
PrintWriter pw = new PrintWriter(fw);
E104Mail logmail = new E104Mail(); // mail object used to send the run report
// Database initial set-up
// Used for the keyword query
E104Conn queryInfoDB = null; // connection that will be used
E104Data dataInfo = null; // query result container that will be used
String sqlCommand="";
// Used for the insert
E104Conn wqueryInfoDB = null; // connection that will be used
E104Data wdataInfo = null; // query result container that will be used
String wsqlCommand="";
// Used to look up the current maximum auto_no
E104Conn fqueryInfoDB = null; // connection that will be used
E104Data fdataInfo = null; // query result container that will be used
String fsqlCommand="";
try
{
// Parse the XML configuration files
log = g.startparse();
l.startparse();
System.out.println("logpath="+log);
// Write to the log file
pw.append("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");
// Initialise the mail
// The SMTP host is set here in code, replacing whatever l.startparse() loaded.
l.MailHost="ex01.e104.com.tw"; // set the SMTP host
logmail.setHost(l.MailHost);
logmail.setFrom(l.MailFrom);
logmail.setTo(l.MailTo);
//logmail.setBcc(l.MailCc);
logmail.setSubject("test google search keyword...");
logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"啟動,其完成動作如下:\n");
// NOTE(review): args[0] is read without a length check, and countermax is
// not used anywhere in the code visible here.
int countermax = Integer.parseInt(args[0]); // limit condition
// The condition must be changed before going live
sqlCommand = "SELECT autonumber, keyword, counter, apstart"+
" FROM info_keylog"+
" where "+
//" WHERE (keyword_group IS NOT NULL)"+
//" and "+
//" keyword_group '104boss'"+
//" counter
//" and rownum
" apstart = 1"+
" ORDER BY counter DESC";
queryInfoDB = new E104Conn(g.driver,g.database,g.username,g.password,false);
System.out.println("連結INFO資料庫成功...");
logmail.appendBody("連結INFO資料庫成功...\n");
// Write to the log file
pw.append("連結INFO資料庫成功...\n");
// Fetch the data
System.out.println(sqlCommand);
logmail.appendBody("sqlCommand:\n"+sqlCommand+"\n");
pw.append("sqlCommand:\n"+sqlCommand+"\n");
queryInfoDB.setSql(sqlCommand);
dataInfo = queryInfoDB.getData();
System.out.println("共"+dataInfo.getRowCount()+"筆\n資料如下(autonumber,keyword,counter)");
logmail.appendBody("共"+dataInfo.getRowCount()+"筆\n資料如下(autonumber,keyword,counter)\n");
pw.append("共"+dataInfo.getRowCount()+"筆\n資料如下:\nautonumber keyword counter\n");
// Show the column values returned by the query
if(dataInfo.getRowCount()!= 0)
// NOTE(review): loop header truncated in this copy (condition/increment missing).
for (int i=0;i
{
System.out.print(dataInfo.getCell(i,"autonumber")+" ");
System.out.print(dataInfo.getCell(i,"keyword")+" ");
System.out.println(dataInfo.getCell(i,"counter")); // read the field value
logmail.appendBody(dataInfo.getCell(i,"autonumber")+" ");
logmail.appendBody(dataInfo.getCell(i,"keyword")+" ");
logmail.appendBody(dataInfo.getCell(i,"counter")+"\n");
pw.append(dataInfo.getCell(i,"autonumber")+" ");
pw.append(dataInfo.getCell(i,"keyword")+" ");
pw.append(dataInfo.getCell(i,"counter")+"\n");
}
System.out.println("開始取得網頁內容:");
logmail.appendBody("開始取得網頁內容:\n");
pw.append("開始取得網頁內容:\n");
// Configure the proxy
// NOTE(review): the proxyHost value carries an "http://" prefix; proxy host
// settings normally take a bare host name — confirm this is intended.
System.getProperties().put("proxySet","true");
System.getProperties().put("proxyHost","http://proxy.hinet.net");
System.getProperties().put("proxyPort","80");
System.out.println("proxySet:"+System.getProperties().getProperty("proxySet"));
System.out.println("proxyHost:"+System.getProperties().getProperty("proxyHost"));
System.out.println("proxyPort:"+System.getProperties().getProperty("proxyPort"));
// Connect to the Google pages
urlgoogle google = new urlgoogle();
String keyword="";
googlehtmlparse googlehp = new googlehtmlparse();
Charcode code = new Charcode();
//System.out.println("音樂:"+code.Utf8URLencode("音樂"));
String wkeyword = "";
int wcounter = 0;
// Decide how many rows to process (capped at 40)
int total=dataInfo.getRowCount();
if (total > 40)
total =40;
// One iteration per keyword row.
// NOTE(review): loop header truncated in this copy.
for (int i=0;i
{
googlehp.init();
//keyword = dataInfo.getCell(i,"keyword");
// One iteration per result page of the current keyword.
// NOTE(review): loop header truncated in this copy.
for(int j=0; j
{
// Offset passed to Google as the "start" query parameter
int page=j*10;
keyword = dataInfo.getCell(i,"keyword");
wcounter = Integer.parseInt(dataInfo.getCell(i,"counter"));
// Keep the raw keyword for the insert; "keyword" is reused for the URL below
wkeyword = keyword;
keyword = code.Utf8URLencode(keyword);
System.out.println("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword);
logmail.appendBody("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");
pw.append("keyword:"+dataInfo.getCell(i,"keyword")+"-"+keyword+"\n");
keyword="http://www.google.com.tw/search?hl=zh-TW&q="+keyword+"&start="+page;
System.out.println("keyword:"+keyword);
logmail.appendBody("keyword:"+keyword+"\n");
pw.append("keyword:"+keyword+"\n");
google.connect(keyword);
google.readContents();
System.out.println("google.wordcount:"+google.wordcount);
logmail.appendBody("google.wordcount:"+google.wordcount+"\n");
pw.append("google.wordcount:"+google.wordcount+"\n");
System.out.println("開始搜尋第"+(j+1)+"頁");
logmail.appendBody("開始搜尋第"+(j+1)+"頁\n");
pw.append("開始搜尋第"+(j+1)+"頁\n");
googlehp.html(google.htmlstr);
System.out.println("error:"+googlehp.error);
logmail.appendBody("error:"+googlehp.error+"\n");
pw.append("error:"+googlehp.error+"\n");
// error == 1 is set by googlehp.html when its marker is not found; stop paging
if (googlehp.error == 1)
break;
}
System.out.println("");
logmail.appendBody("\n");
pw.append("\n");
// Dump everything the parser collected for this keyword.
// NOTE(review): loop header truncated in this copy.
for (int k=0;k
{
System.out.println("links["+k+"]:"+googlehp.links[k]);
System.out.println("titles["+k+"]:"+googlehp.titles[k]);
System.out.println("articles["+k+"]:"+googlehp.articles[k]);
logmail.appendBody("links["+k+"]:"+googlehp.links[k]+"\n");
logmail.appendBody("titles["+k+"]:"+googlehp.titles[k]+"\n");
logmail.appendBody("articles["+k+"]:"+googlehp.articles[k]+"\n");
pw.append("links["+k+"]:"+googlehp.links[k]+"\n");
pw.append("titles["+k+"]:"+googlehp.titles[k]+"\n");
pw.append("articles["+k+"]:"+googlehp.articles[k]+"\n");
}
// Number of entries per page
int kp =0;
// NOTE(review): loop header truncated in this copy.
for (int k=0;k
{
kp = k + 1;
System.out.println("");
logmail.appendBody("\n");
pw.append("\n");
}
// Prepare the result buffers: clear them to "" so an empty slot can be detected
int count=0;
// NOTE(review): loop header truncated in this copy.
for (int z=0;z
{
clinks[z]="";
ctitles[z]="";
carticles[z]="";
}
// Remove duplicate entries
System.out.println("linkcount:"+googlehp.linkcount);
logmail.appendBody("linkcount:"+googlehp.linkcount+"\n");
pw.append("linkcount:"+googlehp.linkcount+"\n");
// For each parsed link x, scan the accepted list: copy it into the first
// empty slot, or drop it if an equal link is met first.
// NOTE(review): both loop headers below are truncated in this copy.
for (int x=0;x
{
for (int y=0;y
{
if (clinks[y].equals("")) // clinks[y] is "" (free slot)
{
clinks[y]=googlehp.links[x];
ctitles[y]=googlehp.titles[x];
carticles[y]=googlehp.articles[x];
count=count+1;
System.out.println("clinks["+y+"]"+clinks[y]);
logmail.appendBody("clinks["+y+"]"+clinks[y]+"\n");
pw.append("clinks["+y+"]"+clinks[y]+"\n");
System.out.println("ctitles["+y+"]"+ctitles[y]);
logmail.appendBody("ctitles["+y+"]"+ctitles[y]+"\n");
pw.append("ctitles["+y+"]"+ctitles[y]+"\n");
System.out.println("carticles["+y+"]"+carticles[y]);
logmail.appendBody("carticles["+y+"]"+carticles[y]+"\n");
pw.append("carticles["+y+"]"+carticles[y]+"\n");
break;
}
else // clinks[y] already holds a value
{
if (googlehp.links[x].equals(clinks[y])) // same link: duplicate, skip it
{
break;
}
else // different link: keep scanning
{
continue;
}
}
}
}
System.out.println("count="+count);
logmail.appendBody("count="+count+"\n");
pw.append("count="+count+"\n");
System.out.println("");
logmail.appendBody(""+"\n");
pw.append(""+"\n");
// sum: entries still to be written; to: index of the next entry in clinks/ctitles/carticles
int sum = count;
int to = 0;
// Set up the database connections
// NOTE(review): two new connections are opened for every keyword here, while
// the finally block closes only the last wqueryInfoDB and never fqueryInfoDB.
wqueryInfoDB = new E104Conn(g.driver,g.database,g.username,g.password,false);
fqueryInfoDB = new E104Conn(g.driver,g.database,g.username,g.password,false);
System.out.println("連結INFO資料庫成功...");
logmail.appendBody("連結INFO資料庫成功...\n");
pw.append("連結INFO資料庫成功...\n");
// Process the data, page by page
// NOTE(review): loop header truncated in this copy.
for (int p=0;p
{
int pcount =0; // page number (1-based)
int change =0; // entries written so far for this page
pcount=p+1;
System.out.println("第"+pcount+"頁");
logmail.appendBody("第"+pcount+"頁\n");
pw.append("第"+pcount+"頁\n");
if (sum == -1) // sum >= 0 means data remains, -1 means finished
break;
if (sum > 0)
{
sum = sum - googlehp.page[p];
// NOTE(review): condition truncated in this copy.
if (sum
sum = 0;
}
System.out.println("sum="+sum);
logmail.appendBody("sum="+sum+"\n");
pw.append("sum="+sum+"\n");
// As written this condition is equivalent to (sum != -1).
if (sum >=0 || sum != -1)
{
// NOTE(review): loop header truncated in this copy.
for (int c=0;c
{
if (to == count)
{
System.out.println("結束");
logmail.appendBody("結束\n");
pw.append("結束\n");
break;
}
int slot = c+1; // ordering value stored in the database
System.out.println("筆數"+change);
logmail.appendBody("筆數"+change+"\n");
pw.append("筆數"+change+"\n");
if (change == googlehp.page[p]) // has the per-page entry count been reached?
{
System.out.println("換頁");
logmail.appendBody("換頁\n");
pw.append("換頁\n");
break;
}
System.out.println("clinks["+to+"]="+clinks[to]);
System.out.println("ctitles["+to+"]="+ctitles[to]);
System.out.println("carticles["+to+"]="+carticles[to]);
logmail.appendBody("\nclinks["+to+"]="+clinks[to]);
logmail.appendBody("\nctitles["+to+"]="+ctitles[to]);
logmail.appendBody("\ncarticles["+to+"]="+carticles[to]);
pw.append("\nclinks["+to+"]="+clinks[to]);
pw.append("\nctitles["+to+"]="+ctitles[to]);
pw.append("\ncarticles["+to+"]="+carticles[to]);
// Find the current maximum serial number; the new row uses max + 1
fsqlCommand =
"select max(auto_no) as mx from over_monitor";
System.out.println("Find:"+fsqlCommand);
//logmail.appendBody("Find:\n"+fsqlCommand+"\n");
//pw.append("Find:\n"+fsqlCommand+"\n");
fqueryInfoDB.setSql(fsqlCommand);
fdataInfo = fqueryInfoDB.getData();
//System.out.println(fdataInfo.getColumnValues(0));
System.out.println("auto_no值:"+fdataInfo.getColumnValues(0));
int max =Integer.parseInt(fdataInfo.getCell(0,"mx"))+1;
System.out.println("max:"+max);
logmail.appendBody("max:"+max);
pw.append("max:"+max);
// Write to the database
//max=max+1;
//result = new String(text.getBytes("GB2312"), "ISO-8859-1");
//clinks[to] = new String(clinks[to].getBytes())
System.out.println("max:"+max);
System.out.println("domain_name:"+clinks[to]);
System.out.println("keyword:"+wkeyword);
System.out.println("pagenum:"+pcount);
System.out.println("slot:"+slot);
System.out.println("adtitle:"+ctitles[to]);
System.out.println("adtext:"+carticles[to]);
System.out.println("engine:g");
System.out.println("create_date:syadate");
System.out.println("counter:"+wcounter);
// NOTE(review): the statement is built by concatenating scraped page text and
// the keyword directly into the SQL; a quote character in any of them breaks
// the statement (SQL injection risk).
wsqlCommand =
"insert into over_monitor "+
" (auto_no,domain_name,keyword,pagenum,slot,adtitle,adtext,engine,create_date,counter)"+
" values("+max+" ,'"+clinks[to]+"' ,'"+wkeyword+"' ,"+pcount+" ,"+slot+" ,'"+ctitles[to]+"' ,'"+carticles[to]+"' ,'g' ,sysdate,"+wcounter+")";
to=to+1;
// Run the statement
System.out.println("keyword:"+wkeyword);
logmail.appendBody("keyword:"+wkeyword+"\n");
pw.append("keyword:"+wkeyword+"\n");
System.out.println("write:"+wsqlCommand);
logmail.appendBody("write:\n"+wsqlCommand+"\n");
pw.append("write:\n"+wsqlCommand+"\n");
change=change+1;
//System.out.println("begin");
wqueryInfoDB.setSql(wsqlCommand);
//wqueryInfoDB.executeUpdate(wsqlCommand);
//System.out.println("setsql");
// The insert is issued through getData(); the returned container is not used.
wdataInfo = wqueryInfoDB.getData();
//System.out.println("wdatainfo");
//System.out.println("共"+wdataInfo.getRowCount()+"筆");
}
// Nothing left to write: mark as finished so the next page iteration breaks
if (sum == 0)
sum=-1;
}
}
System.out.println("");
logmail.appendBody("\n");
pw.append("\n");
// Reset per-keyword state before the next keyword
count =0;
googlehp.init();
//System.out.println("error:"+googlehp.error);
//logmail.appendBody("error:"+googlehp.error+"\n");
//pw.append("error:"+googlehp.error+"\n");
}
//out.write("htmlstr:"+hp.htmlstr);
}
catch (Exception e)
{
// Report the failure on stdout, in the mail body and in the log file
System.out.println(e.getMessage());
System.out.println("系統有誤未完成,請通知INFO的SA!!");
System.out.println("Message="+e.getMessage());
System.out.println("Exception="+e.toString());
e.printStackTrace();
logmail.appendBody("系統有誤未完成,請通知INFO的SA!!\n");
logmail.appendBody("Message="+e.getMessage()+"\n");
logmail.appendBody("Exception="+e.toString()+"\n");
pw.append("系統有誤未完成,請通知INFO的SA!!\n");
pw.append("Message="+e.getMessage()+"\n");
pw.append("Exception="+e.toString()+"\n");
}
finally
{
// Close the connections and release resources
System.out.println("釋放所有資源!!");
logmail.appendBody("釋放所有資源!!\n");
pw.append("釋放所有資源!!\n");
System.out.println("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束");
logmail.appendBody("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束\n");
pw.append("機制於"+new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date())+"結束\n");
// Send the report mail
logmail.send();
dataInfo = null;
logmail = null;
g = null;
l = null;
log = null;
sqlCommand = null;
wsqlCommand = null;
// NOTE(review): queryInfoDB and wqueryInfoDB are still null when the try block
// failed before assigning them; close() would then throw NullPointerException
// and the pw/fw close calls below would be skipped.
queryInfoDB.close();
wqueryInfoDB.close();
//fqueryInfoDB.close();
pw.close();
fw.close();
}
}
}
/**
 * Character-set helpers: ISO-8859-1/GB2312 re-decoding and a percent-encoder /
 * decoder for UTF-8 text.
 *
 * Only three-byte UTF-8 sequences whose first byte is 0xE0-0xEF (written
 * "%eX%XX%XX") are recognised by the decoding side.
 */
class Charcode {
    /**
     * Re-interprets the ISO-8859-1 bytes of {@code text} as GB2312.
     *
     * @param text text whose characters each stand for one raw byte
     * @return the re-decoded text, or the exception's {@code toString()} when
     *         one of the charsets is unavailable
     */
    public String ISO2GB(String text) {
        String result = "";
        try {
            result = new String(text.getBytes("ISO-8859-1"), "GB2312");
        }
        catch (UnsupportedEncodingException ex) {
            result = ex.toString();
        }
        return result;
    }

    /**
     * Re-interprets the GB2312 bytes of {@code text} as ISO-8859-1.
     *
     * @param text text to convert
     * @return the re-decoded text, or "" when one of the charsets is unavailable
     */
    public String GB2ISO(String text) {
        String result = "";
        try {
            result = new String(text.getBytes("GB2312"), "ISO-8859-1");
        }
        catch (UnsupportedEncodingException ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * Percent-encodes every character above U+00FF as its UTF-8 bytes
     * ("%XX", upper-case hex). Characters U+0000-U+00FF are copied unchanged,
     * so URL-reserved characters such as space or '&' are NOT escaped.
     *
     * The text is walked by code point so that a surrogate pair is encoded as
     * one four-byte sequence instead of two replacement bytes.
     *
     * @param text text to encode; must not be null
     * @return the encoded text
     */
    public String Utf8URLencode(String text) {
        StringBuffer result = new StringBuffer();
        int i = 0;
        while (i < text.length()) {
            int codePoint = text.codePointAt(i);
            int width = Character.charCount(codePoint);
            if (codePoint <= 255) {
                result.append((char) codePoint);
            } else {
                byte[] b;
                try {
                    b = text.substring(i, i + width).getBytes("UTF-8");
                } catch (UnsupportedEncodingException ex) {
                    // UTF-8 is a charset every JVM must provide; fail loudly
                    // rather than silently dropping the character.
                    throw new IllegalStateException("UTF-8 is not supported by this JVM", ex);
                }
                for (int j = 0; j < b.length; j++) {
                    int k = b[j] & 0xFF; // bytes are signed; map to 0..255
                    result.append('%').append(Integer.toHexString(k).toUpperCase());
                }
            }
            i += width;
        }
        return result.toString();
    }

    /**
     * Decodes "%eX%XX%XX" escapes back into characters.
     *
     * The whole input is lower-cased first, so unescaped letters come back in
     * lower case. Escapes that are malformed, and an incomplete escape at the
     * end of the text, are kept verbatim.
     *
     * @param text text to decode; may be null
     * @return the decoded text, or "" for null input
     */
    public String Utf8URLdecode(String text) {
        if (text == null) {
            return "";
        }
        StringBuffer result = new StringBuffer();
        text = text.toLowerCase();
        int p = text.indexOf("%e");
        while (p != -1) {
            result.append(text.substring(0, p));
            text = text.substring(p);
            if (text.length() < 9) {
                break; // incomplete trailing escape: appended unchanged below
            }
            result.append(CodeToWord(text.substring(0, 9)));
            text = text.substring(9);
            p = text.indexOf("%e");
        }
        return result.append(text).toString();
    }

    /**
     * Converts one nine-character escape ("%eX%XX%XX") to the character it encodes.
     *
     * @param text candidate escape
     * @return the decoded character, or {@code text} unchanged when it is not a
     *         well-formed escape
     */
    private String CodeToWord(String text) {
        if (!Utf8codeCheck(text)) {
            return text;
        }
        byte[] code = new byte[3];
        code[0] = (byte) Integer.parseInt(text.substring(1, 3), 16);
        code[1] = (byte) Integer.parseInt(text.substring(4, 6), 16);
        code[2] = (byte) Integer.parseInt(text.substring(7, 9), 16);
        try {
            return new String(code, "UTF-8");
        } catch (UnsupportedEncodingException ex) {
            return text;
        }
    }

    /**
     * Checks that {@code text} is exactly "%eX%XX%XX": nine characters, '%' at
     * offsets 0, 3 and 6 only, and hexadecimal digits everywhere else.
     *
     * @param text lower-cased candidate escape
     * @return true when the text has that exact shape
     */
    private boolean Utf8codeCheck(String text){
        if (text.length() != 9 || !text.startsWith("%e")) {
            return false;
        }
        for (int i = 0; i < 9; i++) {
            char c = text.charAt(i);
            if (i % 3 == 0) {
                if (c != '%') {
                    return false;
                }
            } else if (Character.digit(c, 16) < 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether the text, from its first '%' onward, starts with a
     * well-formed "%eX%XX%XX" escape.
     *
     * @param text text to inspect; must not be null
     * @return true when such an escape is found at the first '%'
     */
    public boolean isUtf8Url(String text) {
        text = text.toLowerCase();
        int p = text.indexOf("%");
        // ">= 9": an escape that ends exactly at the end of the text also counts
        if (p != -1 && text.length() - p >= 9) {
            text = text.substring(p, p + 9);
        }
        return Utf8codeCheck(text);
    }
}
/**
 * Settings read from global.xml in the working directory.
 * Each field is filled from the XML element of the same name; {@code apname}
 * comes from the {@code name} attribute of the {@code ap} element.
 */
class global
{
    // Configuration values
    String service="";
    String logpath="";
    int delay;
    String service_des="";
    String name="";
    String driver="";
    String database="";
    String username="";
    String password="";
    int apname;

    /**
     * Parses global.xml, visiting every element in the document and copying
     * the recognised ones into the fields of this object. Element names (and
     * the log path) are echoed to stdout as they are read.
     *
     * @return the value of the {@code logpath} element ("" when absent)
     * @throws Exception when the file cannot be read or parsed, or when
     *         {@code delay} / the {@code ap} name attribute is not an integer
     */
    public String startparse() throws Exception
    {
        String filename = "global.xml"; // XML file name
        SAXBuilder builder = new SAXBuilder();
        Document doc = builder.build(new File(filename));
        Element root = doc.getRootElement();
        XPath xpath = XPath.newInstance("//*"); // select every element
        List list = xpath.selectNodes(root);
        Iterator iter = list.iterator();
        while (iter.hasNext()) {
            Element item = (Element) iter.next();
            String tname = item.getName();
            String vname = item.getText();
            System.out.print(tname+":");
            // tname is never reassigned, so exactly one branch can match.
            if (tname.equals("ap")) {
                String chk = item.getAttributeValue("name");
                System.out.print("name="+chk+" ");
                apname = Integer.parseInt(chk);
            } else if (tname.equals("service")) {
                service = vname;
            } else if (tname.equals("logpath")) {
                logpath = vname;
                System.out.println(logpath);
            } else if (tname.equals("delay")) {
                // trim: surrounding whitespace in the XML text would otherwise
                // make parseInt fail
                delay = Integer.parseInt(vname.trim());
            } else if (tname.equals("service_des")) {
                service_des = vname;
            } else if (tname.equals("name")) {
                name = vname;
            } else if (tname.equals("driver")) {
                driver = vname;
            } else if (tname.equals("database")) {
                database = vname;
            } else if (tname.equals("username")) {
                username = vname;
            } else if (tname.equals("password")) {
                password = vname;
            }
        }
        return logpath;
    }
}
/**
 * Settings read from local.xml in the working directory.
 * Each field is filled from the XML element of the same name (the field and
 * element name "GlobalPtah" is spelled that way on purpose to stay compatible).
 */
class local
{
    // Configuration values
    String GlobalPtah="";
    String EdmPathFree="";
    String EdmPathPay="";
    String ServerPath="";
    String MailHost="";
    String MailFrom="";
    String MailTo="";
    String MailCc="";
    int Sql;

    /**
     * Parses local.xml, visiting every element in the document and copying the
     * recognised ones into the fields of this object. Each element's name and
     * text are echoed to stdout.
     *
     * @throws Exception when the file cannot be read or parsed, or when the
     *         {@code Sql} element is not an integer
     */
    public void startparse() throws Exception
    {
        String filename = "local.xml"; // XML file name
        SAXBuilder builder = new SAXBuilder();
        Document doc = builder.build(new File(filename));
        Element root = doc.getRootElement();
        XPath xpath = XPath.newInstance("//*"); // select every element
        List list = xpath.selectNodes(root);
        Iterator iter = list.iterator();
        while (iter.hasNext()) {
            Element item = (Element) iter.next();
            String tname = item.getName();
            String vname = item.getText();
            System.out.print(tname+":");
            if (tname.equals("GlobalPtah")) {
                GlobalPtah = vname;
            } else if (tname.equals("EdmPathFree")) {
                EdmPathFree = vname;
            } else if (tname.equals("EdmPathPay")) {
                // previously never populated although the field exists
                EdmPathPay = vname;
            } else if (tname.equals("ServerPath")) {
                ServerPath = vname;
            } else if (tname.equals("MailHost")) {
                MailHost = vname;
            } else if (tname.equals("MailFrom")) {
                MailFrom = vname;
            } else if (tname.equals("MailTo")) {
                MailTo = vname;
            } else if (tname.equals("MailCc")) {
                // the original tested "MailTo" a second time here, leaving
                // MailCc permanently empty
                MailCc = vname;
            } else if (tname.equals("Sql")) {
                // trim: surrounding whitespace would make parseInt fail
                Sql = Integer.parseInt(vname.trim());
            }
            System.out.println(vname);
        }
    }
}
/**
 * Fetches one web page over HTTP and keeps its text in {@link #htmlstr}.
 * Usage is two-step: {@link #connect(String)} then {@link #readContents()}.
 * Failures are reported on stderr and leave {@code htmlstr} empty; they are
 * not thrown to the caller.
 */
class urlgoogle implements Runnable
{
    String htmlstr="";  // text of the most recently read page (line breaks removed)
    int wordcount=0;    // running total of characters read; NOT reset between pages
    URLConnection URLConn;

    /**
     * Prepares (but does not yet open) an HTTP connection to {@code urlString}
     * with a fixed user agent and 3-second connect/read timeouts.
     * On any failure the error is printed and {@code URLConn} is left null, so
     * a later {@link #readContents()} cannot reuse a previous connection.
     *
     * @param urlString absolute http URL to fetch
     */
    public void connect( String urlString ) {
        URLConn = null;
        try {
            URL url = new URL(urlString);
            URLConnection conn = (HttpURLConnection)url.openConnection();
            conn.setRequestProperty("User-agent","IE/6.0");
            conn.setConnectTimeout(3000);
            conn.setReadTimeout(3000);
            URLConn = conn;
        } catch (Exception e){
            // covers MalformedURLException, IOException and a non-HTTP URL
            e.printStackTrace();
        }
    }

    /**
     * Reads the response of the connection prepared by {@link #connect(String)}
     * into {@code htmlstr} (lines concatenated without separators) and adds the
     * number of characters read to {@code wordcount}. After a complete read the
     * method pauses five seconds to throttle successive requests.
     *
     * {@code htmlstr} is cleared before anything else, so a failed request can
     * never leave the previous page behind; on a mid-stream error it holds the
     * part that was read.
     *
     * NOTE(review): the response is decoded with the platform default charset —
     * confirm this matches the encoding of the pages being fetched.
     *
     * @throws Exception declared for compatibility; errors are printed, not thrown
     */
    public void readContents() throws Exception {
        htmlstr = "";
        if (URLConn == null) {
            System.err.println("readContents: no connection available, nothing read");
            return;
        }
        StringBuffer page = new StringBuffer();
        BufferedReader in = null;
        boolean completed = false;
        try {
            in = new BufferedReader(new InputStreamReader(URLConn.getInputStream()));
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                page.append(inputLine);
                wordcount = wordcount + inputLine.length();
            }
            completed = true;
        } catch (Exception e){
            e.printStackTrace();
        } finally {
            htmlstr = page.toString();
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        if (completed) {
            try {
                Thread.sleep(5000);
            } catch (InterruptedException e) {
                // keep the interrupt visible to the caller
                Thread.currentThread().interrupt();
            }
        }
    }

    /** Required by Runnable; intentionally does nothing (auto-generated stub). */
    public void run() {
    }
}
// Google result page parser.
// Extracts link / title / text entries from a fetched page and stores them in
// the links, titles and articles arrays; page[] records how many links each
// html() call found.
// NOTE(review): most start/end marker literals in this copy are empty or split
// across lines, and one loop header is truncated. They look like HTML tags and
// "<" expressions lost to markup stripping — restore them from the original
// source before relying on this class.
class googlehtmlparse
{
int htmlstart;
int htmlend;
String htmlstr;
String substrl,substrr;
int error = 0; // error flag: set to 1 by html() when its marker is not found
int lpages = 0;
int rpages = 0; // number of entries filled in page[]
int page[] = new int[100]; // links found per html() call
int linkcount = 0;
int titlecount = 0;
int articlecount = 0;
String titles[] = new String[100];
String articles[] = new String[100];
String links[] = new String[100];
// Resets every counter, string and result array to its empty state.
public void init()
{
htmlstart = 0;
htmlend = 0;
htmlstr ="";
substrl="";
substrr="";
lpages = 0;
rpages = 0;
error =0;
linkcount = 0;
titlecount = 0;
articlecount = 0;
// NOTE(review): loop header truncated in this copy.
for (int i=0;i
{
links[i]="";
titles[i]="";
articles[i]="";
page[i]=0;
}
}
// Default state: resets the scalar counters only (the arrays are left as they are)
public void htmlparse()
{
htmlstart=0;
htmlend=0;
htmlstr="";
linkcount=0;
titlecount=0;
articlecount=0;
error = 0;
}
// Finds the wanted section of the page and parses it.
// Calls init() first, so results of an earlier call are cleared.
// Sets error = 1 when the section marker is not found at an index above 0.
// Returns htmlstr, which is assigned from substrl; substrl is only ever set
// to "" in the visible code.
public String html(String str)
{
int indexl = 0;
int indexr = 0;
int endr = 0;
init();
// Set the parse markers
String stagl="贊助商連結";
String stagr="贊助商連結";
// NOTE(review): the end marker is empty here — presumably a stripped HTML tag.
String etagr="";
System.out.println("str:\n"+str);
// Locate the marker positions
//indexl = str.indexOf(stagl);
//endl = str.indexOf(etagl);
// indexl is 0, so the search for stagr starts after the first stagl.length() characters
indexr = str.indexOf(stagr,indexl+stagl.length());
endr = str.indexOf(etagr,indexr+stagr.length());
//System.out.println("stagl:"+indexl);
//System.out.println("etagl:"+endl);
System.out.println("stagr:"+indexr);
System.out.println("etagr:"+endr);
//substrl = str.substring(indexl,endl);
endr=endr+etagr.length();
//System.out.println("substrl:\n"+substrl);
//Lsubstr(substrl);
if (indexr > 0) // the right-hand ad section exists
{
substrr = str.substring(indexr,endr);
System.out.println("substrr\n"+substrr);
Rsubstr(substrr);
}
else
{
error =1;
}
htmlstr = substrl;
return htmlstr;
}
// Handles the right-hand ad section: extracts links, titles and texts from it.
// Always returns true.
public boolean Rsubstr(String str)
{
Rlink(str);
Rtitle(str);
Rarticle(str);
return true;
}
// Handles R_title: collects every text found between the start and end
// markers into titles[], advancing titlecount. Returns str unchanged.
public String Rtitle(String str)
{
// NOTE(review): marker literals below appear stripped/split in this copy.
String start="";
String end="
";
String title="";
int index= 0;
int to = 0;
System.out.println("B-str:"+str);
index = str.indexOf(start,index);
to = str.indexOf(end,index);
while (index >0 ) // find the title strings and count the occurrences
{
System.out.println("index:"+index);
System.out.println("to:"+to);
title = str.substring(index+start.length(),to);
System.out.println("title:"+title);
// NOTE(review): both replace() patterns are empty here — presumably tags to remove.
title = title.replace("","");
title = title.replace("","");
titles[titlecount] = title;
index += start.length();
index = str.indexOf(start,index);
to = str.indexOf(end,index);
titlecount++;
}
//System.out.println(count);
return str;
}
// Handles R_link: collects every text found between the start and end markers
// into links[], advancing linkcount, and records the number found by this
// call in page[rpages]. Returns str unchanged.
public String Rlink(String str)
{
// NOTE(review): marker literals below are empty in this copy.
String start="";
String end="";
String link="";
int index= 0;
int to = 0;
int count = 0;
System.out.println("B-str:"+str);
index = str.indexOf(start);
to = str.indexOf(end,index);
while (index >0 ) // find the link strings and count the occurrences
{
System.out.println("index:"+index);
System.out.println("to:"+to);
link = str.substring(index+start.length(),to);
System.out.println("link:"+link);
links[linkcount] = link;
index += start.length();
index = str.indexOf(start,index);
to = str.indexOf(end,index);
linkcount++;
count++;
}
page[rpages] = count;
rpages++;
//linkcount = count;
System.out.println("linkcount:"+linkcount);
return str;
}
// Handles R_article: collects every text found between the start and end
// markers into articles[], advancing articlecount. Returns str unchanged.
public String Rarticle(String str)
{
// NOTE(review): marker literals below appear stripped/split in this copy.
String start="
";
String end="
";
String article="";
int index= 0;
int to = 0;
System.out.println("B-str:"+str);
index = str.indexOf(start);
to = str.indexOf(end,index);
while (index >0 ) // find the article strings and count the occurrences
{
System.out.println("index:"+index);
System.out.println("to:"+to);
article = str.substring(index+start.length(),to);
System.out.println("article:"+article);
article = article.replace("","");
article = article.replace("","");
article = article.replace("
","");
articles[articlecount] = article;
index += start.length();
index = str.indexOf(start,index);
to = str.indexOf(end,index);
articlecount++;
}
//System.out.println(count);
return str;
}
}
- Oct 04 Wed 2006 11:00
google資料抓不到嗎?看這就對了^^
全站熱搜
留言列表
發表留言