学到的东西

jsoup

connect与get方法
element 与 elements 方法
element 中的 attr 与 text 方法
获取 element 与 elements 的 select 方法
OutputStream

getBytes 方法
write 方法
其他

字符串的 replace 方法
jar包
使用launch4j包了一个应用程序
代码

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Command-line crawler that downloads a novel from www.aixiaxsw.com and stores it
 * as a single Markdown file.
 *
 * <p>The table-of-contents page is fetched first; its {@code h1} text becomes the
 * title and file name, and every {@code a[href]} found inside a {@code dd} element
 * is treated as a chapter link. Each chapter page is fetched in turn and the text
 * of its {@code #content} element is appended to the file under a level-2 heading.
 */
public class Spider {
    /** Origin of the supported site; chapter hrefs are appended to it verbatim. */
    private static final String SITE_ROOT = "https://www.aixiaxsw.com";
    /** Table-of-contents page used when the user does not supply one. */
    private static final String SAMPLE_MENU_URL = SITE_ROOT + "/61/61977/";
    /** Directory prefix of the generated Markdown file. */
    private static final String OUTPUT_DIR = "./";
    /** Number of leading links in the list that are skipped before crawling starts. */
    private static final int LEADING_LINKS_TO_SKIP = 9;
    /** Timeout passed to jsoup for every chapter request, in milliseconds. */
    private static final int CHAPTER_TIMEOUT_MILLIS = 10000;

    /**
     * Runs the interactive crawl.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be opened, written or closed
     */
    public static void main(String[] args) throws IOException {
        String menuUrl = promptForMenuUrl(new Scanner(System.in));

        // Fetch the table-of-contents page; a failure is reported to the user below.
        Document document = null;
        try {
            document = Jsoup.connect(menuUrl).get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (document == null) {
            System.out.println("当前网络不佳!!\n或者您未输入正确格式的网址!");
            return;
        }

        // selectFirst returns null when nothing matches, so check before dereferencing.
        Element heading = document.body().selectFirst("h1");
        if (heading == null) {
            System.out.println("未能在目录页中找到小说标题(h1)!");
            return;
        }
        String title = heading.text();
        System.out.println("开始爬取:" + title);

        // Chapter links: every <a> carrying an href inside a <dd>.
        Elements chapterLinks = document.body().select("dd").select("a[href]");

        // The title comes from a remote page, so strip characters that are path
        // separators or illegal in file names before using it as one.
        String fileName = title.replaceAll("[\\\\/:*?\"<>|]", "_") + ".md";
        System.out.println("小说将保存在:" + OUTPUT_DIR + fileName + " 中");
        File file = new File(OUTPUT_DIR + fileName);

        boolean completed = true;
        // try-with-resources closes the stream on every exit path; a file that cannot
        // be opened now fails immediately instead of after crawling every chapter.
        try (OutputStream fileOut = new FileOutputStream(file)) {
            // The novel title goes first, as the document's top-level heading.
            fileOut.write(("# " + title + "\n\n").getBytes(StandardCharsets.UTF_8));

            int index = 0;
            for (Element link : chapterLinks) {
                // Skip the leading block of links that precedes the real chapter list.
                if (index++ < LEADING_LINKS_TO_SKIP) {
                    continue;
                }

                String subLink = link.attr("href");
                String chapterName = link.text();
                System.out.println("当前爬取章节:" + chapterName);

                Document chapter = null;
                try {
                    chapter = Jsoup.connect(SITE_ROOT + subLink)
                            .timeout(CHAPTER_TIMEOUT_MILLIS)
                            .get();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                if (chapter == null) {
                    // Stop at the first unreachable chapter; what was written so far is kept.
                    System.out.println("当前网络不佳!!");
                    completed = false;
                    break;
                }

                Element chapterContent = chapter.selectFirst("#content");
                if (chapterContent == null) {
                    System.out.println("未找到章节正文,已跳过:" + chapterName);
                    continue;
                }

                String result = formatChapterText(chapterContent.text());
                fileOut.write(("## " + chapterName + "  [原文链接](" + SITE_ROOT + subLink + ")\n\n"
                        + result + "\n\n").getBytes(StandardCharsets.UTF_8));
            }
        }

        // Report success only after the stream is closed and no chapter fetch failed.
        if (completed) {
            System.out.println("文件传输完成!");
        }
    }

    /**
     * Asks the user whether to crawl a custom table-of-contents page or the sample one.
     *
     * <p>Entering {@code 1} prompts for a URL, which is accepted only if it starts with
     * the supported site's root; any other integer selects the sample novel. Input that
     * is not an integer, or a URL on another site, repeats the prompt.
     *
     * @param scanner source of the user's whitespace-separated input tokens
     * @return the table-of-contents URL to crawl
     */
    private static String promptForMenuUrl(Scanner scanner) {
        while (true) {
            System.out.println("输入1则根据下次输入的目录爬取,输入2则爬取示例小说"
                    + "\n注意,网址必须以 / 结尾!!!");
            int choice;
            try {
                choice = Integer.parseInt(scanner.next());
            } catch (NumberFormatException e) {
                System.out.println("请输入数字 1 或 2!");
                continue;
            }
            if (choice != 1) {
                return SAMPLE_MENU_URL;
            }
            // Keep the candidate separate so a rejected URL never replaces the sample.
            String candidate = scanner.next();
            if (candidate.startsWith(SITE_ROOT + "/")) {
                return candidate;
            }
            System.out.println("请重新输入以 " + SITE_ROOT + "/ 开头的网址!!!");
        }
    }

    /**
     * Prepares chapter text for Markdown output: replaces {@code *} (a Markdown emphasis
     * marker) with {@code .} and inserts a blank line after each sentence terminator.
     *
     * @param raw plain text of the chapter's content element
     * @return the reformatted text
     */
    private static String formatChapterText(String raw) {
        return raw.replace("*", ".")
                .replace("。", "。\n\n")
                .replace("!", "!\n\n")
                .replace("?", "?\n\n");
    }
}
龚佳轩
软件园学生在线

[后端二] 龚佳轩

学到的东西

jsoup

OutputStream

其他

代码