爬取存入数据库
/**
 * Crawls JD.com category and brand listing pages and hands the parsed entries to
 * {@code CategoryService} and {@code BrandService}.
 *
 * <p>Requires the jsoup dependency:
 *
 * <pre>{@code
 * <dependency>
 *   <groupId>org.jsoup</groupId>
 *   <artifactId>jsoup</artifactId>
 *   <version>1.11.3</version>
 * </dependency>
 * }</pre>
 */
@RestController
public class DemoController {
    @Autowired
    CategoryService categoryService;
    @Autowired
    BrandService brandService;

    /**
     * Fetches the JD "all categories" page and saves a three-level category tree.
     *
     * <p>Every {@code category-item} element becomes a level-1 category (parent id 0), each
     * {@code <dl>} inside it a level-2 category, and each link in that {@code <dl>}'s first
     * {@code <dd>} a level-3 category.
     *
     * @return the value of {@code AxiosResult.success()}
     * @throws IOException if the page cannot be fetched
     */
    @GetMapping("getCategoryFromJD")
    public AxiosResult<Void> setData() throws IOException {
        Document document = Jsoup.connect("https://www.jd.com/allSort.aspx").get();
        for (Element group : document.getElementsByClass("category-items")) {
            for (Element item : group.getElementsByClass("category-item")) {
                // Level-1 category name: combined text of the item's <span> elements.
                Category firstCategory = new Category();
                firstCategory.setCatetoryName(item.getElementsByTag("span").text());
                firstCategory.setCategoryLevel(1);
                firstCategory.setParentId(0L);
                categoryService.save(firstCategory);

                for (Element group2 : item.getElementsByTag("dl")) {
                    saveCategoryGroup(group2, firstCategory);
                }
            }
        }
        return AxiosResult.success();
    }

    /**
     * Saves one {@code <dl>} group: its {@code <dt>} link as a level-2 category and the links of
     * its first {@code <dd>} as level-3 children.
     *
     * <p>A group without a {@code <dt><a>} title is skipped instead of aborting the whole crawl
     * with an {@code IndexOutOfBoundsException}.
     */
    private void saveCategoryGroup(Element group, Category parent) {
        Elements titles = group.getElementsByTag("dt");
        if (titles.isEmpty()) {
            return;
        }
        Elements titleLinks = titles.get(0).getElementsByTag("a");
        if (titleLinks.isEmpty()) {
            return;
        }

        Category secondCategory = new Category();
        // NOTE(review): relies on save() having populated the parent's id — confirm.
        secondCategory.setParentId(parent.getId());
        secondCategory.setCatetoryName(titleLinks.get(0).text());
        secondCategory.setCategoryLevel(2);
        categoryService.save(secondCategory);

        Elements details = group.getElementsByTag("dd");
        if (details.isEmpty()) {
            // A title with no detail list simply has no level-3 children.
            return;
        }
        for (Element link : details.get(0).getElementsByTag("a")) {
            Category thirdCategory = new Category();
            thirdCategory.setParentId(secondCategory.getId());
            thirdCategory.setCatetoryName(link.text());
            thirdCategory.setCategoryLevel(3);
            categoryService.save(thirdCategory);
        }
    }

    /**
     * Fetches the JD brand page and saves one {@code Brand} per well-formed {@code <li>} found
     * under the {@code brandslist} elements.
     *
     * <p>The brand name is the text of the first link inside the entry's second {@code <span>};
     * the description and logo come from the first image's {@code alt} and {@code src}
     * attributes. Entries missing the image, the second span, or the link are skipped rather
     * than failing the request.
     *
     * @return the value of {@code AxiosResult.success()}
     * @throws Exception if the page cannot be fetched
     */
    @GetMapping("getBrandFromJD")
    public AxiosResult<Void> getBrandFromJd() throws Exception {
        Document document = Jsoup.connect("https://www.jd.com/brand.aspx").get();
        for (Element list : document.getElementsByClass("brandslist")) {
            for (Element entry : list.getElementsByTag("li")) {
                Elements images = entry.getElementsByTag("img");
                Elements spans = entry.getElementsByTag("span");
                if (images.isEmpty() || spans.size() < 2) {
                    continue;
                }
                Elements nameLinks = spans.get(1).getElementsByTag("a");
                if (nameLinks.isEmpty()) {
                    continue;
                }

                Element image = images.get(0);
                Brand brand = new Brand();
                brand.setBrandName(nameLinks.get(0).text());
                brand.setBrandDesc(image.attr("alt"));
                brand.setBrandLogo(image.attr("src"));
                // Placeholder site URL; the page's real brand link is not read here.
                brand.setBrandSite("http://www.baidu.com");
                brandService.save(brand);
            }
        }
        return AxiosResult.success();
    }
}
爬取输出txt
/**
 * Fetches the JD "all categories" page and writes the third-level category names to a text
 * file, echoing them to standard output as well.
 *
 * <p>Each {@code <dl>} group under a {@code category-item} contributes one line: the text of
 * every link in its first {@code <dd>}, each followed by a space, terminated by CRLF. Groups
 * without a {@code <dd>} are skipped. If the page yields no groups, an empty file is written
 * instead of failing with a {@code NullPointerException}.
 *
 * @throws IOException if the page cannot be fetched or the file cannot be written
 */
public static void setData() throws IOException {
    Document document = Jsoup.connect("https://www.jd.com/allSort.aspx").get();
    StringBuilder content = new StringBuilder();
    for (Element group : document.getElementsByClass("category-items")) {
        for (Element item : group.getElementsByClass("category-item")) {
            for (Element categoryGroup : item.getElementsByTag("dl")) {
                Elements details = categoryGroup.getElementsByTag("dd");
                if (details.isEmpty()) {
                    continue;
                }
                for (Element link : details.get(0).getElementsByTag("a")) {
                    String text = link.text();
                    System.out.print(text + " ");
                    content.append(text).append(' ');
                }
                System.out.println("\n");
                content.append("\r\n");
            }
        }
    }
    byte[] bytes = content.toString().getBytes(StandardCharsets.UTF_8);
    // try-with-resources closes the stream even when write() throws.
    try (FileOutputStream stream = new FileOutputStream("C://Users//Desktop//京东分类目录.txt")) {
        stream.write(bytes);
    }
}
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/192885.html