Java開發項目集錦(附源碼)Word版

上傳人：每**** 文檔編號：52157201 上傳時間：2022-02-07 格式：DOC 頁數：70 大小：434.50KB

收藏 版權申訴 舉報 下載

第1頁 / 共70頁

第2頁 / 共70頁

第3頁 / 共70頁

下載文檔到電腦，查找使用更方便

0 積分

下載資源

還剩頁未讀，繼續閱讀

資源描述：

《Java開發項目集錦(附源碼)Word版》由會員分享，可在線閱讀，更多相關《Java開發項目集錦(附源碼)Word版（70頁珍藏版）》請在裝配圖網上搜索。

1、新浪天氣預(yù)報新聞java抓去程序 package .weather1; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import .URL;

2、import .URLConnection; import java.util.regex.Matcher; import java.util.regex.Pattern; import mons.logging.Log; import mons.logging.LogFactory; import .update.Getdata; /** * 正則方式抓取新浪天氣新聞上的新聞 * 地址 * @param args */ public class Newlist { private static final Log log = LogFactory.ge

3、tLog(Newlist.class); /** * 測試 * @param args */ public static void main(String args[]){ Newlist n=new Newlist(); String[] k=n.getNewList(); for (int i=0;i

4、l=")); } String[] m=n.getNewinfo("news/2008/1119/35261.html"); for (int l=0;l

5、 * @return */ public String[] getNewinfo(String url){ String URL=" //30是指取30段滿足給出的正則條件的字符串，如果只找出10個，那數(shù)組后面的全為null String[] s = analysis("

(.*?)

" , getContent(URL) , 30); for (int i=0;i

6、"); Matcher matcher = sp.matcher(s[i]); if (matcher.find()){ String imageurl=analysis("src=\"(.*?)\"" , s[i] , 1)[0]; if(!imageurl.startsWith("http://")){ imageurl=" }

7、 System.out.println("新聞有圖片:"+imageurl); String content=getContent(imageurl); String[] images=imageurl.split("/"); String imagename=images[images.length-1]; System.out.println("圖片名:"+imagename);

8、 try { File fwl = new File(imagename); PrintWriter outl = new PrintWriter(fwl); outl.println(content); outl.close(); } catch (IOException e) { // TODO Auto-generated catch block e.

9、printStackTrace(); } System.out.println("s[i]:"+s[i]); //修改文件圖片地址 s[i]=s[i].replace(analysis("src=\"(.*?)\"" , s[i] , 1)[0], imagename); } } return s; } public String[] getNewList(){ 推薦精選

10、 String url=" return getNewList(getContent(url)); } private String[] getNewList(String content ){ //String[] s = analysis("align=\"center\" valign=\"top\">" , content , 50); String[]

11、 s = analysis("

(.*?)

" , content , 50); return s; } private String[] analysis(String pattern, String match , int i){ Pattern sp = Ppile(pattern); Matcher matcher = sp.matcher(match); String[] content = new String[i]; for (int i

12、1 = 0; matcher.find(); i1++){ content[i1] = matcher.group(1); } //下面一段是為了剔除為空的串 int l=0; for (int k=0;k

13、 String[] content2; if (l!=0){ content2=new String[l]; for (int n=0;n

14、址獲取網(wǎng)頁內(nèi)容 * @param strUrl * @return private String getContent(String strUrl){ try{ //URL url = new URL(strUrl); 推薦精選 //BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream())); URLConnection uc = new URL(str

15、Url).openConnection(); //通過修改http頭的User-Agent來偽裝成是通過瀏覽器提交的請求 uc.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)"); System.out.println("--------------------

16、---------------------"); System.out.println("Content-Length: "+uc.getContentLength()); System.out.println("Set-Cookie: "+uc.getHeaderField("Set-Cookie")); System.out.println("-----------------------------------------"); //獲取文

17、件頭信息 System.out.println("Header"+uc.getHeaderFields().toString()); System.out.println("-----------------------------------------"); BufferedReader br=new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312")); String s = "";

18、 StringBuffer sb=new StringBuffer(); while((s = br.readLine())!=null){ sb.append(s+"\r\n"); } System.out.println("長度+"+sb.toString().length()); return sb.toString(); }catch(Exception e){

19、 return "error open url" + strUrl; } } */ public static String getContent (String strUrl){ URLConnection uc = null; String all_content=null; try { all_content =new String(); URL url = new URL(strUrl);

20、 uc = url.openConnection(); uc.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)"); System.out.println("-----------------------------------------");

21、System.out.println("Content-Length: "+uc.getContentLength()); System.out.println("Set-Cookie: "+uc.getHeaderField("Set-Cookie")); 推薦精選 System.out.println("-----------------------------------------"); //獲取文件頭信息 System.out.pri

22、ntln("Header"+uc.getHeaderFields().toString()); System.out.println("-----------------------------------------"); if (uc == null) return null; InputStream ins = uc.getInputStream(); ByteArrayOutputStream outputstrea

23、m = new ByteArrayOutputStream(); byte[] str_b = new byte[1024]; int i = -1; while ((i=ins.read(str_b)) > 0) { outputstream.write(str_b,0,i); } all_content = outputstream.toString();

24、 // System.out.println(all_content); } catch (Exception e) { e.printStackTrace(); log.error("獲取網(wǎng)頁內(nèi)容出錯"); }finally{ uc = null; } // return new String(all_content.getBytes("ISO88

25、59-1")); System.out.println(all_content.length()); return all_content; } } 現在的問題是：圖片下載不全，我用後面兩種getContent方法下圖片，下來的圖片大小都和文件頭裡獲得的Content-Length，也就是圖片的實際大小不符，預覽不了。而且反覆測試，兩種方法每次下來的東西大小是固定的，所以重復下載沒有用？測試toString後length大小比圖片實際的小，而生成的圖片比圖片數據大。下載後存儲過程中圖片數據增

26、加了！圖片數據流toString過程中數據大小發生了改變，還原不回來。其它新聞內容沒有問題。估計是圖片的編碼格式等的問題。在圖片數據流讀過來時直接生成圖片就可以了。 public int saveImage (String strUrl){ URLConnection uc = null; try { URL url = new URL(strUrl); uc = url.openConnection(); uc.setRequestProperty("Us

27、er-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)"); 推薦精選 //uc.setReadTimeout(30000); //獲取圖片長度 //System.out.println("Content-Length: "+uc.getContentLength()); //獲取文件頭信息 /

28、/System.out.println("Header"+uc.getHeaderFields().toString()); if (uc == null) return 0; InputStream ins = uc.getInputStream(); byte[] str_b = new byte[1024]; int byteRead=0;

29、 String[] images=strUrl.split("/"); String imagename=images[images.length-1]; File fwl = new File(imagename); FileOutputStream fos= new FileOutputStream(fwl); while ((byteRead=ins.read(str_b)) > 0) {

30、 fos.write(str_b,0,byteRead); }; fos.flush(); fos.close(); } catch (Exception e) { e.printStackTrace(); log.error("獲取網(wǎng)頁內(nèi)容出錯"); }finally{ uc = null;

31、 } return 1; } 方法二：首先把搜索後的頁面用流讀取出來，再寫個正則，去除不要的內容，再把最後的結果存成xml格式文件、或者直接存入數據庫，用的時候再調用。本代碼只是顯示html頁的源碼內容，如果需要抽取內容請自行改寫public static String regex()中的正則式 package rssTest; import java.io.BufferedReader; import java.io.IOException; import java.io.Inpu

32、tStreamReader; import .HttpURLConnection; import .MalformedURLException; 推薦精選 import .URL; import .URLConnection; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class MyRSS {

33、 /** * 獲取搜索結(jié)果的html源碼 * */ public static String getHtmlSource(String url) { StringBuffer codeBuffer = null; BufferedReader in=null; try { URLConnection uc = new URL(url).openConnection();

34、 /** * 為了限制客戶端不通過網(wǎng)頁直接讀取網(wǎng)頁內(nèi)容,就限制只能從瀏覽器提交請求. * 但是我們可以通過修改http頭的User-Agent來偽裝,這個代碼就是這個作用 * */ uc.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt

35、)"); // 讀取url流內(nèi)容 in = new BufferedReader(new InputStreamReader(uc .getInputStream(), "gb2312")); codeBuffer = new StringBuffer(); String tempCode = ""; // 把buffer內(nèi)的值讀取出來,保存到code中

36、 while ((tempCode = in.readLine()) != null) { codeBuffer.append(tempCode).append("\n"); } in.close(); } catch (MalformedURLException e) 推薦精選 { e.printStackTrace();

37、 } catch (IOException e) { e.printStackTrace(); } return codeBuffer.toString(); } /** * 正則表達式 * */ public static String regex() { String googleRegex = "

38、ss=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)(.*?)

(.*?)
"; return googleRegex; } /** * 測試用 * 在google中檢索關(guān)鍵字，并抽取自己想要的內(nèi)容 * * */ public static List GetNews() { List newsList = new Ar

39、rayList(); String allHtmlSource = MyRSS .getHtmlSource(" maxthon&hs=SUZ&q=%E8%A7%81%E9%BE%99%E5%8D%B8%E7%94%B2&meta=&aq=f"); Pattern pattern = Ppile(regex()); Matcher matcher = pattern.matcher(allHtmlSource); while (mat

40、cher.find()) { String urlLink = matcher.group(2); String title = matcher.group(4); title = title.replaceAll("", ""); title = title.replaceAll("", ""); title = title.replaceAll("..

41、.", ""); 推薦精選 String content = matcher.group(6); content = content.replaceAll("", ""); content = content.replaceAll("", ""); content = content.replaceAll("...", ""); newsLi

42、st.add(urlLink); newsList.add(title); newsList.add(content); } return newsList; } /** * main方法 * */ public static void main(String[] args) { System.out .println(MyRSS

43、 .getHtmlSource(" } } 方法三： jsp自動抓取新聞自動抓取新聞 package com.news.spider; import java.io.File; import java.io.FileFilter; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.List; import

44、 java.util.regex.Matcher; import java.util.regex.Pattern; import com.db.DBAccess; public class SpiderNewsServer { public static void main(String[] args) throws Exception{ //設(shè)置抓取信息的首頁面 String endPointUrl = " //獲得當(dāng)前時間推薦精選 Calendar calendar=Calendar.getInstance(); Sim

45、pleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd"); String DateNews = sdf.format(calendar.getTime()); /******************** * 抓取二級URl 開始 * url匹配類型：" */ List listNewsType = new ArrayList(); //取入口頁面html WebHtml webHtml = new WebHtml(); String htmlDocue

46、mtnt1 = webHtml.getWebHtml(endPointUrl); if(htmlDocuemtnt1 == null || htmlDocuemtnt1.length() == 0){ return; } String strTemp1 = " String strTemp2 = ""; int stopIndex=0; int startIndex=0; int dd=0; while(true){ dd++; startIndex = htmlDocuemtnt1.ind

47、exOf(strTemp1, stopIndex); System.out.println("=========="+startIndex); stopIndex= htmlDocuemtnt1.indexOf(strTemp2, startIndex); System.out.println("==========---------"+stopIndex); if(startIndex!=-1 && stopIndex!=-1){ String companyType=htmlDocuemtnt1.substring(startI

48、ndex,stopIndex); System.out.println("@@@@@--------"+companyType); System.out.println("@@@@@--------"+companyType.indexOf("\"")); companyType=companyType.substring(0,companyType.indexOf("\"")); System.out.println("#####--------"+companyType); listNewsType.add(companyType

49、); } if(dd>10){ break; } if(stopIndex==-1 || startIndex==-1){ break; } } System.out.println("listCompanyType====="+listNewsType.size()); /** 推薦精選 * 抓取二級URl 結(jié)束 ********************/ /******************** * 抓取頁面內(nèi)容開始 */

50、 String title=""; String hometext=""; String bodytext=""; String keywords=""; String counter = "221"; String cdate= ""; int begainIndex=0;//檢索字符串的起點索引 int endIndex=0;//檢索字符串的終點索引 String begainStr;//檢索開始字符串 String endStr;//檢索結(jié)束字符串 for

51、 (int rows = 1; rows < listNewsType.size(); rows++) { String strNewsDetail = listNewsType.get(rows).toString(); System.out.println("strNewsDetail====="+strNewsDetail); if(strNewsDetail != null && strNewsDetail.length() > 0){ WebHtml newsListHtml = new WebHtml(); String ht

52、mlDocuemtntCom = newsListHtml.getWebHtml(strNewsDetail); System.out.println("$$$$$------"+htmlDocuemtntCom); if(htmlDocuemtntCom == null || htmlDocuemtntCom.length() == 0){ return; } //截取時間 int dateBegainIndex = htmlDocuemtntCom.indexOf("

時間：");

53、 System.out.println("%%%%%--"+dateBegainIndex); String newTime = htmlDocuemtntCom.substring(dateBegainIndex,dateBegainIndex+20); System.out.println("^^^^^^^^^^^^^^^---"+newTime); String newTimeM = newTime.substring(newTime.lastIndexOf("-")+1,newTime.lastIndexOf("-")+3); S

54、tring dateM = DateNews.substring(DateNews.lastIndexOf("-")+1); System.out.println("^^^^^^^^^^^^^^^---"+newTimeM); System.out.println("^^^^^^^^^^^^^^^---"+dateM); if(newTimeM == dateM || newTimeM.equals(dateM)){ //檢索新聞標(biāo)題 begainStr="

55、推薦精選 endStr="

時間："; begainIndex=htmlDocuemtntCom.indexOf(begainStr,0); System.out.println("&&&&&&------"+begainIndex); endIndex=htmlDocuemtntCom.indexOf(endStr,0); System.out.println("&&&&&&------"+endIndex); if(begainIndex!=-1 && endIndex!=

56、-1){ title = htmlDocuemtntCom.substring(begainIndex,endIndex).trim(); title = title.substring(title.indexOf("

")+4,title.indexOf("

")); title = title.replace("'", ""); title = title.replace(";", ""); title = title.replace(" ", ""); }

57、 //檢索新聞內(nèi)容 begainStr="

"; endStr=""; begainIndex=htmlDocuemtntCom.indexOf(begainStr,0); endIndex=htmlDocuemtntCom.indexOf(endStr,0); if(begainIndex!=-1 && endIndex!=-1){ bodytext = htmlDocuemtntCom.subs

58、tring(begainIndex,endIndex).trim(); if(bodytext.indexOf("

")>0 && bodytext.indexOf("

")>bodytext.indexOf("

") && bodytext.indexOf("

")>0) bodytext = bodytext.substring(bodytext.indexOf("

")+3,bodytext.indexOf("

")); bodytext=bodytext.replace(" ", "");

59、 bodytext=bodytext.replace("
", ""); bodytext=bodytext.replace("\n", "
"); bodytext=bodytext.replace("'", ""); bodytext=bodytext.replace(";", ""); } //簡介 if(bodytext.length()>40) hometext = bodytext.substring(0,40)+"......"; else{

60、 hometext = bodytext+"......"; } //瀏覽量 String str = String.valueOf(Math.random()); counter = str.substring(str.lastIndexOf(".")+1,5); Calendar cal = Calendar.getInstance(); cal.setTime(new Date()); cdate = cal.getTimeInMillis()+""; 推薦精

61、選 cdate = cdate.substring(0,10); }else{ continue; } } System.out.println("-------------------------"+title); System.out.println("-------------------------"+cdate); System.out.println("-------------------------"+cdate); System.out.println("--------

62、-----------------"+hometext); System.out.println("-------------------------"+bodytext); System.out.println("-------------------------"+keywords); System.out.println("-------------------------"+counter); /*String str = "INSERT INTO ecim_stories(uid,title,created,published,hostname

63、,hometext,bodytext,keywords,counter,topicid,ihome,notifypub,story_type,topicdisplay,topicalign,comments,rating,votes,description) "; str += "VALUE (1,'"+title+"',"+cdate+","+cdate+",'125.122.83.177','"+hometext+"','"+bodytext+"','"+keywords+"',"+counter+",1,0,1,'admin',0,'R',0,0,0,'')"; DB

64、Access db = new DBAccess();; if(db.executeUpdate(str)>0) { System.out.println("-------------------------成功?。。。。。。。。?！"); }else { System.out.println("-------------------------失?。。。。。。。。。?！"); }*/ } /** * 抓取頁面內(nèi)容結(jié)束 ********************/ } } package com.news.spider; import .URL; import .URLConnection; import java.io.BufferedReader; import java.io.InputStreamReader; public class WebHtml { /** * 根據(jù)url,抓取webhmtl內(nèi)容 * @param url 推薦精選 */ public String getWebHtml(String url){ try { UR

展開閱讀全文

溫馨提示:
1: 本站所有資源如無特殊說明，都需要本地電腦安裝OFFICE2007和PDF閱讀器。圖紙軟件為CAD,CAXA,PROE,UG,SolidWorks等.壓縮文件請下載最新的WinRAR軟件解壓。
2: 本站的文檔不包含任何第三方提供的附件圖紙等，如果需要附件，請聯繫上傳者。文件的所有權益歸上傳用戶所有。
3. 本站RAR壓縮包中若帶圖紙，網頁內容裡面會有圖紙預覽，若沒有圖紙預覽就沒有圖紙。
4. 未經權益所有人同意不得將文件中的內容挪作商業或盈利用途。
5. 裝配圖網僅提供信息存儲空間，僅對用戶上傳內容的表現方式做保護處理，對用戶上傳分享的文檔內容本身不做任何修改或編輯，並不能對任何下載內容負責。
6. 下載文件中如有侵權或不適當內容，請與我們聯繫，我們立即糾正。
7. 本站不保證下載資源的準確性、安全性和完整性，同時也不承擔用戶因使用這些下載資源對自己和他人造成任何形式的傷害或損失。

點擊下載此資源

Java開發項目集錦(附源碼)Word版

")+4,title.indexOf("

最新文檔

相關資源

相關搜索