over 7 years ago
讓操作 NoSQL 像 ORM 般簡單,這邊是用 Elasticsearch 當儲存,並用 jsoup 來爬文
http://docs.spring.io/spring-data/elasticsearch/docs/current/reference/html/
專案配置
buildscript {
    // Single source of truth for the Spring Boot version, shared by the
    // Gradle plugin and every starter so they cannot drift apart.
    ext {
        springBootVersion = '1.2.5.RELEASE'
    }
    repositories {
        mavenCentral()
    }
    dependencies {
        classpath("org.springframework.boot:spring-boot-gradle-plugin:${springBootVersion}")
    }
}

apply plugin: 'java'
apply plugin: 'eclipse'
apply plugin: 'spring-boot'

sourceCompatibility = 1.8
targetCompatibility = 1.8
version = '1.0'

repositories {
    mavenCentral()
}

dependencies {
    compile 'org.jsoup:jsoup:1.8.2'
    compile "org.springframework.boot:spring-boot-starter:${springBootVersion}",
            "org.springframework.boot:spring-boot-starter-data-elasticsearch:${springBootVersion}",
            "org.springframework.boot:spring-boot-starter-web:${springBootVersion}"
    // Lombok generates code while javac runs, so it has to be on the compile
    // classpath; the 'runtime' configuration would leave @Data unresolved.
    compile 'org.projectlombok:lombok:1.16.4'
    // Test-only libraries stay out of the packaged application.
    testCompile "org.springframework.boot:spring-boot-starter-test:${springBootVersion}"
    testCompile group: 'junit', name: 'junit', version: '4.+'
}
儲存體
package com.news.entities;
import java.util.Date;
import java.util.List;
import org.joda.time.DateTime;
import org.springframework.data.annotation.CreatedDate;
import org.springframework.data.annotation.Id;
import org.springframework.data.annotation.Version;
import org.springframework.data.elasticsearch.annotations.DateFormat;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldIndex;
import org.springframework.data.elasticsearch.annotations.FieldType;
import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.Data;
/**
 * Elasticsearch document holding one crawled news article.
 * Accessors are not written by hand; the class relies on Lombok's {@code @Data}.
 */
@Data
// Stored in index "mitakev1" under type "news": 10 shards, 0 replicas, refreshInterval "-1".
@Document(indexName = "mitakev1", type = "news", shards = 10, replicas = 0, refreshInterval = "-1")
public class News {
// Document identifier.
@Id
private String id;
// Version marker for the document.
@Version
private Long version;
// Publication date kept as plain text.
// NOTE(review): nothing in the visible crawler code assigns this field — confirm whether it is still needed.
private String postdate;
// The href taken from the listing page.
private String link;
// The anchor text of that listing link.
private String linktext;
// Article body text.
private String content;
// Stocks mentioned alongside the article, mapped as a nested field.
@Field(type= FieldType.Nested, index = FieldIndex.not_analyzed)
private List<Stock> relatedStocks;
// Industry names mentioned alongside the article, mapped as not-analyzed strings.
@Field(type= FieldType.String, index = FieldIndex.not_analyzed)
private List<String> relatedIndustries;
// Creation timestamp: Jackson writes it as a string with the pattern below,
// and the field is mapped as a date using the basic_date_time format.
@JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyyMMdd'T'HHmmss.SSS'Z'")
@Field(type = FieldType.Date, format = DateFormat.basic_date_time, index = FieldIndex.not_analyzed)
@CreatedDate
private Date createdDateTime;
}
持久化操作
package com.news.repository;
import java.util.List;
import org.springframework.data.elasticsearch.repository.ElasticsearchRepository;
import com.news.entities.News;
/**
 * Repository for {@link News} documents, keyed by their {@code String} id.
 */
public interface NewsRepository extends ElasticsearchRepository<News, String> {

    /**
     * Query method named after the {@code content} property of {@link News}.
     *
     * @param content the content value to search for
     * @return the matching news documents
     */
    List<News> findByContent(String content);
}
服務類別
package com.news.service;
import java.io.IOException;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.news.entities.News;
import com.news.entities.Stock;
import com.news.repository.NewsRepository;
/**
 * Crawls daily news listings from fund.bot.com.tw with jsoup and stores every
 * article through {@link NewsRepository}.
 */
@Service
public class NewsService {

    /** Listing page for one day; {@code %s} receives {@link LocalDate#toString()}. */
    private static final String LIST_URL_FORMAT = "http://fund.bot.com.tw/Z/ZF/ZF_H_%s.djhtm";

    /** Article page; {@code %s} receives the href captured from the listing page. */
    private static final String ARTICLE_URL_FORMAT = "http://fund.bot.com.tw/%s";

    /** Literal page text that opens the cell listing related stocks. */
    private static final String RELATED_STOCKS_PREFIX = "•相關個股";

    /** A stock token is an id of this many characters followed by the stock name. */
    private static final int STOCK_ID_LENGTH = 4;

    @Autowired
    private NewsRepository repository;

    /**
     * Returns every stored news document.
     *
     * @return all documents known to the repository
     */
    public Iterable<News> findAll() {
        return repository.findAll();
    }

    /** Removes every stored news document. */
    public void deleteAll() {
        repository.deleteAll();
    }

    /**
     * Crawls one listing page per day and saves every article found on it.
     *
     * <p>The range is half-open: the loop runs while the current day is before
     * {@code end}, so {@code end} itself is not crawled.
     *
     * @param start first day to crawl, formatted {@code yyyyMMdd}
     * @param end   day to stop before, formatted {@code yyyyMMdd}
     */
    public void crawler(String start, String end) {
        LocalDate day = parseBasicDate(start);
        LocalDate endDate = parseBasicDate(end);
        while (day.isBefore(endDate)) {
            List<News> newslist = getNewsList(day.toString());
            System.out.println("date=" + day + ", newslist.size=" + newslist.size());
            for (News news : newslist) {
                getNewsContent(news);
                repository.save(news);
            }
            day = day.plusDays(1L);
        }
    }

    /** Builds a date from the year, month and day digits of a {@code yyyyMMdd} string. */
    private static LocalDate parseBasicDate(String text) {
        return LocalDate.of(
                Integer.parseInt(text.substring(0, 4)),
                Integer.parseInt(text.substring(4, 6)),
                Integer.parseInt(text.substring(6, 8)));
    }

    /**
     * Removes ordinary spaces and non-breaking spaces. jsoup's {@code text()}
     * emits {@code &nbsp;} entities as U+00A0, which would otherwise end up in
     * the stock id and name fields.
     */
    private static String stripSpaces(String text) {
        return text.replace(" ", "").replace("\u00A0", "");
    }

    /**
     * Reads the listing page for one day and creates a {@link News} per link.
     *
     * @param date the day as it appears in the listing URL
     * @return the news found; empty when the page cannot be fetched
     */
    private List<News> getNewsList(String date) {
        List<News> newslist = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(String.format(LIST_URL_FORMAT, date)).get();
            Elements resultLinks = doc.select("table :gt(0) > td > a");
            for (Element el : resultLinks) {
                News news = new News();
                news.setLink(el.attr("href"));
                news.setLinktext(el.text());
                news.setCreatedDateTime(new Date());
                newslist.add(news);
            }
        } catch (IOException e) {
            // Best effort: a day whose listing cannot be read yields no news.
            e.printStackTrace();
        }
        return newslist;
    }

    /**
     * Fetches the article behind {@code news.getLink()} and fills in its
     * content, related stocks and related industries.
     *
     * <p>Failures are reported and swallowed so that one bad article does not
     * abort the whole crawl.
     *
     * @param news the entry to enrich in place
     */
    private void getNewsContent(News news) {
        try {
            Document doc = Jsoup.connect(String.format(ARTICLE_URL_FORMAT, news.getLink())).get();

            // The page also carries a timestamp such as "(92/06/23 16:20:43)" in
            // "table :gt(0) div.p01"; it is not stored at the moment.

            // Article body; the cell may be missing on unexpected layouts.
            Element content = doc.select("table :gt(0) td.p1").first();
            if (content != null) {
                news.setContent(stripSpaces(content.text()));
            }

            // Sample cell text: "•相關個股: 1736喬山" (related stocks) and
            // "•相關產業: 生物科技" (related industries).
            List<Stock> relatedStocks = new ArrayList<>();
            List<String> relatedIndustries = new ArrayList<>();
            for (Element el : doc.select("table :gt(0) td.p2")) {
                String text = stripSpaces(el.text());
                String[] parts = text.split(":");
                if (parts.length < 2) {
                    // No "label:value" pair in this cell; skip it rather than
                    // losing everything collected so far.
                    continue;
                }
                if (text.startsWith(RELATED_STOCKS_PREFIX)) {
                    addStocks(parts[1], relatedStocks);
                } else {
                    // Any other labelled cell is treated as related industries.
                    relatedIndustries.add(parts[1]);
                }
            }
            news.setRelatedStocks(relatedStocks);
            news.setRelatedIndustries(relatedIndustries);
        } catch (IOException | RuntimeException e) {
            // Best effort: keep whatever was already set on the entry.
            e.printStackTrace();
        }
    }

    /** Splits a comma-separated stock list into {@link Stock} objects appended to {@code target}. */
    private static void addStocks(String stockList, List<Stock> target) {
        for (String stocktext : stockList.split(",")) {
            if (stocktext.length() < STOCK_ID_LENGTH) {
                // Too short to contain an id; ignore the malformed token.
                continue;
            }
            Stock stock = new Stock();
            stock.setStockid(stocktext.substring(0, STOCK_ID_LENGTH));
            stock.setStockname(stocktext.substring(STOCK_ID_LENGTH));
            target.add(stock);
        }
    }
}
再來最重要的啟動類別
package com.news.app;
import com.news.service.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.data.elasticsearch.repository.config.EnableElasticsearchRepositories;
/**
 * Spring Boot entry point. Enables the Elasticsearch repositories under
 * {@code com.news.repository}, scans {@code com.news} for components, and
 * starts a crawl once the application is running.
 */
@SpringBootApplication
@EnableElasticsearchRepositories(basePackages = {"com.news.repository"})
@ComponentScan(basePackages = "com.news")
public class Application implements CommandLineRunner {

    @Autowired
    private NewsService newsService;

    /**
     * Launches the application with the caller's arguments plus {@code --debug}.
     *
     * @param args command-line arguments, forwarded to Spring Boot
     * @throws Exception if startup fails
     */
    public static void main(String[] args) throws Exception {
        // Forward the real command line instead of discarding it, and keep the
        // debug switch this launcher has always enabled.
        String[] launchArgs = new String[args.length + 1];
        System.arraycopy(args, 0, launchArgs, 0, args.length);
        launchArgs[args.length] = "--debug";
        SpringApplication.run(Application.class, launchArgs);
    }

    /**
     * Runs the crawler over a fixed date range.
     *
     * @param args application arguments; not used
     * @throws Exception if the crawl fails
     */
    @Override
    public void run(String... args) throws Exception {
        newsService.crawler("20030623", "20150722");
    }
}
需要加上 @EnableElasticsearchRepositories,並用 basePackages 指定 Repository 介面所在的套件
配置參數檔
spring.data.elasticsearch.cluster-nodes : localhost:9300
spring.data.elasticsearch.repositories.enabled : true
啟動後就可以透過Controller執行爬文
curl -X GET -H "Content-Type: application/json" -H "Cache-Control: no-cache" 'http://localhost:8080/api/v1/newscrawler?datestart=20150701&dateend=20150702'
取得所有文章
curl -X GET -H "Content-Type: application/json" -H "Cache-Control: no-cache" 'http://localhost:8080/api/v1/news'
範例程式
GitHub