今天写了一个简单的爬虫,跟大家分享一下,后续还会分享难度更高的爬虫。话不多说,直接上代码。如果有疑问,可以直接在评论区留言。
package com.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.dao.FriendLinkDao;
/**
 * Simple crawler for the navigation pages of www.shujuju.cn.
 *
 * <p>It collects the "more" links from the navigation index page, visits each of them,
 * extracts the channel name plus every link listed under it, and stores the result through
 * {@link FriendLinkDao}.
 */
public class SnatchSHUJUJU {

    /**
     * Downloads and parses the page at the given URL.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} when the request fails with an
     *         {@link IOException} (the stack trace is printed)
     */
    public static Document getDocument(String url) {
        try {
            // 5000 is the connection timeout in milliseconds.
            return Jsoup.connect(url).timeout(5000).get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Collects the URLs behind every "more" link on the navigation index page.
     *
     * @return the site host concatenated with each link's {@code href}; empty when the index
     *         page could not be fetched
     */
    public static List<String> getEveryOtherUrl() {
        List<String> urlList = new ArrayList<>();
        String host = "http://www.shujuju.cn";
        String url = "http://www.shujuju.cn/navigation/navigationPage";
        Document document = getDocument(url);
        if (document == null) {
            // getDocument already reported the failure; nothing to crawl.
            return urlList;
        }
        Elements moreBlocks = document.select("[class=more fr]");
        Elements anchors = moreBlocks.select("a[href]");
        for (Element anchor : anchors) {
            urlList.add(host + anchor.attr("href"));
        }
        return urlList;
    }

    /**
     * Visits every channel page and extracts the links listed on it.
     *
     * <p>Pages that cannot be fetched, or that contain no {@code nav-sort-info} element, are
     * skipped instead of aborting the whole crawl.
     *
     * @param list channel page URLs, as produced by {@link #getEveryOtherUrl()}
     * @return one map per link with the keys {@code channelName}, {@code linkUrl} and
     *         {@code name}
     */
    public static List<Map> getDetailUrl(List<String> list) {
        List<Map> mapList = new ArrayList<>();
        if (list == null) {
            return mapList;
        }
        for (String url : list) {
            Document document = getDocument(url);
            if (document == null) {
                continue;
            }
            Elements sortInfo = document.select("[class=nav-sort-info]");
            if (sortInfo.isEmpty()) {
                // Page layout differs from what is expected: no channel header to read.
                continue;
            }
            String channelName = sortInfo.get(0).select("h4").text();
            System.out.println("channelName:" + channelName);
            Elements anchors = sortInfo.select("[class=nav-sort-body clearfix]").select("a");
            for (Element anchor : anchors) {
                // Values are Object because main() later adds an Integer channelId to the map.
                Map<String, Object> map = new HashMap<>();
                String linkUrl = anchor.attr("href");
                String name = anchor.text();
                System.out.println("linkUrl:" + linkUrl);
                System.out.println("name:" + name);
                map.put("channelName", channelName);
                map.put("linkUrl", linkUrl);
                map.put("name", name);
                mapList.add(map);
            }
        }
        return mapList;
    }

    /**
     * Crawls the site and stores every link, creating its channel row first when missing.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<Map> list = getDetailUrl(getEveryOtherUrl());
        FriendLinkDao friendDao = new FriendLinkDao();
        for (Map map : list) {
            String channelName = map.get("channelName").toString();
            Integer channelId = friendDao.getChannelId(channelName);
            if (channelId == -1) {
                // Channel not stored yet: create it, then read back its id.
                friendDao.insertChannelName(channelName, 1);
                channelId = friendDao.getChannelId(channelName);
            }
            if (channelId == -1) {
                // Creating or looking up the channel failed; do not store a link with id -1.
                System.err.println("skip link, no channel id for: " + channelName);
                continue;
            }
            System.out.println("channelId: " + channelId);
            map.put("channelId", channelId);
            map.put("stat", "1");
            friendDao.insertFriendLink(map);
        }
    }
}
package com.dao;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Date;
import java.util.Map;
import java.util.Set;
import com.util.ConnectUtil;
/**
 * Data-access helper for the {@code t_zsff_friend_link_channel} and
 * {@code t_zsff_friend_link} tables.
 *
 * <p>Every statement and result set is closed through try-with-resources; the connection
 * itself stays open for the lifetime of the DAO.
 */
public class FriendLinkDao {

    /** Connection used by all queries; obtained once from {@link ConnectUtil#getConn()}. */
    public Connection conn = ConnectUtil.getConn();

    /**
     * Looks up the id of a channel by its name.
     *
     * @param channelName channel name to search for
     * @return the id of the last matching row, or {@code -1} when no row matches or the query
     *         fails with an {@link SQLException}
     */
    public Integer getChannelId(String channelName) {
        Integer id = -1;
        String sql = "SELECT id FROM t_zsff_friend_link_channel WHERE channel_name = ?";
        try (PreparedStatement ptmt = conn.prepareStatement(sql)) {
            ptmt.setString(1, channelName);
            try (ResultSet rs = ptmt.executeQuery()) {
                while (rs.next()) {
                    id = rs.getInt("id");
                }
            }
            return id;
        } catch (SQLException e) {
            e.printStackTrace();
            return id; // Still -1 unless a row was read before the failure.
        }
    }

    /**
     * Inserts a new channel row. Failures are printed and otherwise ignored.
     *
     * @param channelName value for the {@code channel_name} column
     * @param pid value for the {@code pid} column
     */
    public void insertChannelName(String channelName, Integer pid) {
        String sql = "INSERT INTO t_zsff_friend_link_channel (channel_name, pid) VALUES (?, ?)";
        try (PreparedStatement ptmt = conn.prepareStatement(sql)) {
            ptmt.setObject(1, channelName);
            ptmt.setObject(2, pid);
            ptmt.executeUpdate();
        } catch (Exception e) {
            // Best effort: report the failure and let the caller continue.
            e.printStackTrace();
        }
    }

    /**
     * Inserts a friend-link row. Failures are printed and otherwise ignored.
     *
     * @param map source of the column values; the keys {@code name}, {@code channelId},
     *            {@code linkUrl} and {@code stat} are read
     */
    public void insertFriendLink(Map map) {
        String sql = "INSERT INTO t_zsff_friend_link (name, channel_id, link_url, stat) VALUES (?, ?, ?, ?)";
        try (PreparedStatement ptmt = conn.prepareStatement(sql)) {
            ptmt.setObject(1, map.get("name"));
            ptmt.setObject(2, map.get("channelId"));
            ptmt.setObject(3, map.get("linkUrl"));
            ptmt.setObject(4, map.get("stat"));
            ptmt.executeUpdate();
        } catch (Exception e) {
            // Best effort: report the failure and let the caller continue.
            e.printStackTrace();
        }
    }
}
package com.util;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
/**
 * Holds the JDBC connection shared by the DAO classes.
 *
 * <p>NOTE(review): the JDBC URL and credentials are hard-coded below; consider moving them
 * to configuration before using this outside a local test setup.
 */
public class ConnectUtil {

    /** Most recently opened (or injected) connection; {@code null} until one is obtained. */
    private static Connection conn;

    /**
     * Returns the shared connection, opening it when there is none or it has been closed.
     *
     * <p>An already open connection is reused, so repeated calls no longer open (and leak) a
     * new connection each time.
     *
     * @return the cached connection; when opening a new one fails the stack trace is printed
     *         and the previous value (possibly {@code null}) is returned
     */
    public static Connection getConn() {
        try {
            if (conn != null && !conn.isClosed()) {
                return conn;
            }
            // 1. Load the MySQL JDBC driver class.
            Class.forName("com.mysql.jdbc.Driver");
            // 2. Location of the database server and the name of the database to use.
            // NOTE(review): host is 127.0.0.7, not the usual 127.0.0.1 — confirm it is intended.
            String url = "jdbc:mysql://127.0.0.7:3306/test?characterEncoding=UTF-8";
            // 3. Database user name and password.
            String username = "root";
            String password = "root";
            // 4. Open the connection through the driver manager.
            conn = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Replaces the shared connection.
     *
     * @param conn1 connection that subsequent {@link #getConn()} calls should return while it
     *              remains open
     */
    public void setConn(Connection conn1) {
        conn = conn1;
    }
}
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/71469.html