over 2 years ago

Spring Data Elasticsearch 讓操作 NoSQL 像使用 ORM 一樣簡單;這邊用 Elasticsearch 當儲存,並用 jsoup 來爬文。

http://docs.spring.io/spring-data/elasticsearch/docs/current/reference/html/

專案配置

build.gradle
// Build-script classpath: makes the Spring Boot Gradle plugin available to this build.
buildscript {
    repositories { mavenCentral() }
    dependencies {
        classpath 'org.springframework.boot:spring-boot-gradle-plugin:1.2.5.RELEASE'
    }
}

// Plugins: plain Java build, Eclipse project generation, Spring Boot packaging/run tasks.
apply plugin: 'java'
apply plugin: 'eclipse'
apply plugin: 'spring-boot'

// Compile for and target Java 8.
sourceCompatibility = 1.8
targetCompatibility = 1.8
version = '1.0'

// Project dependencies are resolved from Maven Central.
repositories { mavenCentral() }

dependencies {
    // HTML fetching/parsing used by the crawler.
    compile 'org.jsoup:jsoup:1.8.2'

    // Keep every Spring Boot starter on the same release as the Gradle plugin (1.2.5.RELEASE).
    compile 'org.springframework.boot:spring-boot-starter:1.2.5.RELEASE',
            'org.springframework.boot:spring-boot-starter-data-elasticsearch:1.2.5.RELEASE',
            'org.springframework.boot:spring-boot-starter-web:1.2.5.RELEASE'

    // Lombok's annotations are processed by javac, so it has to be on the compile
    // classpath; a 'runtime' dependency is not visible to the compiler.
    compile 'org.projectlombok:lombok:1.16.4'

    // Test-only libraries stay out of the application's compile/runtime classpath.
    testCompile 'org.springframework.boot:spring-boot-starter-test:1.2.5.RELEASE'
    testCompile group: 'junit', name: 'junit', version: '4.+'
}

儲存體

News.java
package com.news.entities;

import java.util.Date;
import java.util.List;

import org.joda.time.DateTime;
import org.springframework.data.annotation.CreatedDate;
import org.springframework.data.annotation.Id;
import org.springframework.data.annotation.Version;
import org.springframework.data.elasticsearch.annotations.DateFormat;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldIndex;
import org.springframework.data.elasticsearch.annotations.FieldType;

import com.fasterxml.jackson.annotation.JsonFormat;

import lombok.Data;

/**
 * A crawled news article, stored as an Elasticsearch document in index
 * {@code mitakev1}, type {@code news}.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}. The index is declared with
 * 10 shards, 0 replicas and a refresh interval of {@code "-1"}
 * (NOTE(review): presumably this turns off periodic index refresh; confirm against
 * the Elasticsearch settings documentation).
 */
@Data
@Document(indexName = "mitakev1", type = "news", shards = 10, replicas = 0, refreshInterval = "-1")
public class News {
    // Document id.
    @Id
    private String id;
    // Document version, managed through Spring Data's @Version.
    @Version
    private Long version;
    // Post date of the article. NOTE(review): nothing in the visible crawler code sets this field.
    private String postdate;
    // href of the article link taken from the news list page.
    private String link;
    // Visible text of the article link on the news list page.
    private String linktext;
    // Article body text.
    private String content;
    // Stocks mentioned as related to the article; mapped as a nested field.
    @Field(type= FieldType.Nested, index = FieldIndex.not_analyzed)
    private List<Stock> relatedStocks;
    // Industries mentioned as related to the article; stored as not-analyzed strings.
    @Field(type= FieldType.String, index = FieldIndex.not_analyzed)
    private List<String> relatedIndustries;
    
    // Creation timestamp, serialized with the pattern yyyyMMdd'T'HHmmss.SSS'Z' and
    // mapped as an Elasticsearch date using the basic_date_time format.
    @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyyMMdd'T'HHmmss.SSS'Z'")
    @Field(type = FieldType.Date, format = DateFormat.basic_date_time, index = FieldIndex.not_analyzed)
    @CreatedDate
    private Date createdDateTime;
}

持久化操作

NewsRepository.java
package com.news.repository;

import java.util.List;
import org.springframework.data.elasticsearch.repository.ElasticsearchRepository;
import com.news.entities.News;

/**
 * Spring Data Elasticsearch repository for {@link News} documents, keyed by their
 * {@code String} id.
 */
public interface NewsRepository extends ElasticsearchRepository<News, String> {

    /**
     * Derived query on the {@code content} property of {@link News}.
     *
     * @param content value to look up in the {@code content} field
     * @return the matching news documents
     */
    List<News> findByContent(String content);
}

服務類別

NewsService.java
package com.news.service;

import java.io.IOException;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.news.entities.News;
import com.news.entities.Stock;
import com.news.repository.NewsRepository;

/**
 * Crawls news list pages and article pages from fund.bot.com.tw with jsoup and stores
 * each article as a {@link News} document through {@link NewsRepository}.
 */
@Service
public class NewsService {
    @Autowired
    private NewsRepository repository;

    /**
     * Returns every stored news document.
     *
     * @return all {@link News} documents known to the repository
     */
    public Iterable<News> findAll(){
        return repository.findAll();
    }

    /** Deletes every stored news document. */
    public void deleteAll(){
        this.repository.deleteAll();
    }

    /**
     * Crawls one news list page per day in the given range, fetches the content of every
     * listed article and saves each article to the repository.
     *
     * @param start first day to crawl, formatted {@code yyyyMMdd}
     * @param end   day at which crawling stops, formatted {@code yyyyMMdd}; this day itself
     *              is NOT crawled (the range is end-exclusive)
     */
    public void crawler(String start, String end){
        LocalDate day = parseDate(start);
        LocalDate endDate = parseDate(end);
        while (day.isBefore(endDate)) {
            // LocalDate.toString() yields yyyy-MM-dd, which is what goes into the list-page URL.
            List<News> newslist = getNewsList(day.toString());
            System.out.println("date=" + day + ", newslist.size=" + newslist.size());
            for (News news : newslist) {
                this.getNewsContent(news);
                this.repository.save(news);
            }
            day = day.plusDays(1L);
        }
    }

    /** Parses a {@code yyyyMMdd} string (first eight characters) into a {@link LocalDate}. */
    private static LocalDate parseDate(String yyyymmdd){
        return LocalDate.of(
                Integer.parseInt(yyyymmdd.substring(0, 4)),
                Integer.parseInt(yyyymmdd.substring(4, 6)),
                Integer.parseInt(yyyymmdd.substring(6, 8)));
    }

    /**
     * Fetches the news list page for one day and builds a {@link News} stub (link, link text,
     * creation time) for every article link found on it.
     *
     * @param date day to fetch, as it appears in the list-page URL
     * @return the article stubs; empty when the page could not be fetched
     */
    private List<News> getNewsList(String date){
        List<News> newslist = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(String.format("http://fund.bot.com.tw/Z/ZF/ZF_H_%s.djhtm", date)).get();
            Elements resultLinks = doc.select("table :gt(0) > td > a");
            for (Element el : resultLinks) {
                News news = new News();
                news.setLink(el.attr("href"));
                news.setLinktext(el.text());
                news.setCreatedDateTime(new Date());
                newslist.add(news);
            }
        } catch (IOException e) {
            // Best effort: a day whose list page cannot be fetched is skipped, not fatal.
            System.err.println("Failed to fetch news list for date=" + date);
            e.printStackTrace();
        }
        return newslist;
    }

    /**
     * Fetches the article page behind {@code news.getLink()} and fills in the article's
     * content, related stocks and related industries. Parts of the page that are missing
     * or not in the expected shape are skipped instead of aborting the whole article.
     *
     * @param news article stub to complete in place
     */
    private void getNewsContent(News news){
        try {
            Document doc = Jsoup.connect(String.format("http://fund.bot.com.tw/%s", news.getLink())).get();

            // NOTE(review): the post time (e.g. 92/06/23 16:20:43) appears to live in
            // "table :gt(0) div.p01"; News.postdate is not populated here yet.

            // Article body; first() returns null when the page has no such cell.
            Element content = doc.select("table :gt(0) td.p1").first();
            if (content != null) {
                news.setContent(content.text().replaceAll(" ", ""));
            }

            // Related cells look like "•相關個股:  1736喬山" / "•相關產業:  生物科技".
            List<Stock> relatedStocks = new ArrayList<>();
            List<String> relatedIndustries = new ArrayList<>();
            for (Element el : doc.select("table :gt(0) td.p2")) {
                String text = el.text().replaceAll(" ", "");
                String[] parts = text.split(":");
                if (parts.length < 2) {
                    // No "label:value" pair in this cell; nothing to extract.
                    continue;
                }
                if (text.startsWith("•相關個股")) {
                    for (String stocktext : parts[1].split(",")) {
                        if (stocktext.length() < 4) {
                            // Too short to hold the 4-character stock id prefix.
                            continue;
                        }
                        Stock stock = new Stock();
                        stock.setStockid(stocktext.substring(0, 4));
                        stock.setStockname(stocktext.substring(4));
                        relatedStocks.add(stock);
                    }
                } else {
                    // Any other related cell is treated as a related-industry entry.
                    relatedIndustries.add(parts[1]);
                }
            }
            news.setRelatedStocks(relatedStocks);
            news.setRelatedIndustries(relatedIndustries);
        } catch (IOException | RuntimeException e) {
            // Best effort: one broken article must not stop a multi-day crawl.
            System.err.println("Failed to load news content for link=" + news.getLink());
            e.printStackTrace();
        }
    }
}

再來最重要的啟動類別

Application.java
package com.news.app;

import com.news.service.*;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.data.elasticsearch.repository.config.EnableElasticsearchRepositories;

/**
 * Spring Boot entry point. Enables the Elasticsearch repositories under
 * {@code com.news.repository}, scans {@code com.news} for components, and runs the news
 * crawler once on startup through {@link CommandLineRunner}.
 */
@SpringBootApplication
@EnableElasticsearchRepositories(basePackages = {"com.news.repository"})
@ComponentScan(basePackages="com.news")
public class Application implements CommandLineRunner{
    
    @Autowired
    private NewsService newsService;
  
    /**
     * Starts the application with the caller's command-line arguments plus {@code --debug}.
     *
     * @param args command-line arguments, forwarded to Spring Boot
     * @throws Exception if startup fails
     */
    public static void main(String[] args) throws Exception {
        // Forward the real arguments instead of dropping them; "--debug" stays always on.
        String[] bootArgs = new String[args.length + 1];
        System.arraycopy(args, 0, bootArgs, 0, args.length);
        bootArgs[args.length] = "--debug";
        SpringApplication.run(Application.class, bootArgs);
    }

    /**
     * Runs the crawler over the fixed range 20030623 to 20150722 once the context is ready.
     *
     * @param args application arguments (not used)
     * @throws Exception if the crawl fails
     */
    @Override
    public void run(String... args) throws Exception {
        newsService.crawler("20030623", "20150722");
    }
}

需要加上 @EnableElasticsearchRepositories,並用 basePackages 指定 Repository(持久化介面)所在的套件

配置參數檔

application.yml
# Spring Data Elasticsearch settings.
spring:
  data:
    elasticsearch:
      # Elasticsearch node address to connect to (host:port).
      cluster-nodes: localhost:9300
      repositories:
        # Turn on Spring Data Elasticsearch repository support.
        enabled: true

啟動後就可以透過 Controller 執行爬文(Controller 程式碼未列於本文,可參考文末的範例程式)

curl --request GET --header "Content-Type: application/json" --header "Cache-Control: no-cache" 'http://localhost:8080/api/v1/newscrawler?datestart=20150701&dateend=20150702'

取得所有文章

curl --request GET --header "Content-Type: application/json" --header "Cache-Control: no-cache" 'http://localhost:8080/api/v1/news'

範例程式
GitHub

← JDK8 Date Time 使用Spring-Batch進行排程工作 →
 
comments powered by Disqus