代码拉取完成,页面将自动刷新
高校数据爬取,清洗,建模,实现系统;相关技术:Java爬虫,MySQL数据处理,SSM
1.2.1 利其器(搭框架)
1.2.2 通过Json爬取(未加密)
(1) 进入官网:https://gkcx.eol.cn/school/search 。这个表中数据是我想要的。
(2) 打开开发者工具。这个json数据是我想要的。
(3) 分析Json地址:https://api.eol.cn/gkcx/api/?access_token=&admissions=¢ral=&department=&dual_class=&f211=&f985=&is_dual_class=&keyword=&page=1&province_id=&request_type=1&school_type=&signsafe=&size=20&sort=view_total&type=&uri=apigkcx/api/school/hotlists
简化链接: https://api.eol.cn/gkcx/api/?page=1&school_type=&size=20&type=&uri=apigkcx/api/school/hotlists)
查看json结构:用于在Java代码中解析json数据
(4)确定了我们要的数据,如下:
id;/*编号*/
school_id;/*学校编号*/
name;/*学校名称*/
rank;/*全国热度排名*/
rank_type;/*类别热度排名*/
level_name;/*办学类型*/
belong;/*附属院校*/
city_name;/*所在城市*/
county_name;/*所在区域*/
province_name;/*所在省*/
nature_name;/*院校类型*/
(5)编码,爬取、存储
工程结构:
Maven
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>crawler_college</artifactId>
<version>1.0-SNAPSHOT</version>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.2.4.RELEASE</version>
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.encoding>UTF-8</maven.compiler.encoding>
<java.version>1.8</java.version>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.10.2</version>
</dependency>
<dependency>
<groupId>jakarta.persistence</groupId>
<artifactId>jakarta.persistence-api</artifactId>
<version>2.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.48</version>
</dependency>
<!--SpringMVC-->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<scope>compile</scope>
</dependency>
<!--SpringData Jpa-->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-dbcp2</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>5.2.3.RELEASE</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>
#DB Configuration:
spring.datasource.driverClassName=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/colleges
spring.datasource.username=root
spring.datasource.password=123456
spring.datasource.type=org.apache.commons.dbcp2.BasicDataSource
spring.datasource.dbcp2.max-wait-millis=60000
spring.datasource.dbcp2.min-idle=20
spring.datasource.dbcp2.initial-size=2
spring.datasource.dbcp2.connection-properties=characterEncoding=utf8
spring.datasource.dbcp2.validation-query=SELECT 1
spring.datasource.dbcp2.test-while-idle=true
spring.datasource.dbcp2.test-on-borrow=true
spring.datasource.dbcp2.test-on-return=false
#JPA Configuration:
spring.jpa.database=MySQL
spring.jpa.show-sql=true
spring.jpa.hibernate.naming.implicit-strategy=org.springframework.boot.orm.jpa.hibernate.SpringImplicitNamingStrategy
spring.jpa.hibernate.naming.physical-strategy=org.springframework.boot.orm.jpa.hibernate.SpringPhysicalNamingStrategy
package com.fslq.dao;
import com.fslq.pojo.SchoolCond;
import org.springframework.data.jpa.repository.JpaRepository;
/*持久层*/
public interface SchoolCondDao extends JpaRepository<SchoolCond,Integer> {
}
package com.fslq.service;
import com.fslq.pojo.*;
import java.util.List;
import java.util.Set;
/*业务层*/
public interface SchoolService {
//查询所有学校ID
List<Integer> findAllSchoolId();
//保存SchoolCond
void saveAll(List<SchoolCond> lists);
//查询所有SchoolCond
List<SchoolCond> findAll(SchoolCond schoolCond);
}
package com.fslq.impl;
import com.fslq.dao.*;
import com.fslq.pojo.*;
import com.fslq.service.SchoolService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/*实现类*/
@Service
public class SchoolServiceImpl implements SchoolService
{
@Autowired
private SchoolCondDao schoolCondDao;
@Override
public List<Integer> findAllSchoolId() {
List<Integer> school_ids=new ArrayList<>( );
schoolCondDao.findAll( ).stream().forEach( p->{
school_ids.add( p.getSchool_id() );
} );
return school_ids;
}
@Override
@Transactional//注入事务
public void saveAll(List<SchoolCond> lists) {
schoolCondDao.saveAll( lists );
}
@Override
public List<SchoolCond> findAll(SchoolCond schoolCond) {
Example<SchoolCond> of=Example.of( schoolCond );
return schoolCondDao.findAll(of);
}
}
package com.fslq.pipeline;
import com.fasterxml.jackson.databind.JsonNode;
import com.fslq.pojo.SchoolCond;
import com.fslq.service.SchoolService;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import static com.fslq.task.AppProcessor.MAPPER;
import static java.lang.Thread.sleep;
/*Spring数据入库类*/
@Component
public class SchoolCondDataPipeline implements Pipeline {
@Autowired
private SchoolService schoolService;
@Override
public void process(ResultItems resultItems,Task task) {
//1.条件查询院校
//获取网页封装的实体类
List<SchoolCond> lists=null;
try{
lists=searchSchool();
}catch(IOException e){
e.printStackTrace();
}catch(InterruptedException e){
e.printStackTrace();
}
//实体信息不为空
if(lists!=null){
schoolService.saveAll( lists );
}
}
// 查学校:全部-综合-普通院校-全部
private List<SchoolCond> searchSchool() throws IOException, InterruptedException {
// 创建一个 chrome 的浏览器实例
WebDriver driver=new ChromeDriver( new ChromeOptions().addArguments( "--headless" ) );
List<SchoolCond> lists=new LinkedList<>();
try{
// 设置 chrome 的路径
//System.setProperty( "webdriver.chrome.driver","C:\\Users\\22966\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe" );
WebDriverWait wait=new WebDriverWait( driver,50 );
driver.get( "https://gkcx.eol.cn/school/search?recomschprop=%E7%BB%BC%E5%90%88&argschtype=%E6%99%AE%E9%80%9A%E6%9C%AC%E7%A7%91" );
//末页
wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[2]/div[2]/ul/li[11]" ) ) ).click();
//尾页页码
int last_page_num=Integer.parseInt( wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( "//*[@id='root']/div/div/div/div/div/div/div[3]/div[1]/div[2]/div[2]/ul/li[9]" ) ) ).getText() );
//遍历每一页
do {
//GET PAGE CONTENT
String url="https://api.eol.cn/gkcx/api/?page="+last_page_num+"&school_type=6000&size=20&type=5000&uri=apigkcx/api/school/hotlists";
driver.get( url );
sleep( 500 );
String text=wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( "/html/body" ) ) ).getText();
//解析Json
JsonNode s=MAPPER.readTree( text ).get( "data" ).get( "item" );
//SAVE SQL
//存入Map结构
Map<String, Object> map=null;
for (JsonNode p : s) {
map=MAPPER.convertValue( p,Map.class );
//存入数据库对象实例
lists.add( saveSchoolCondInfo( map ) );
}
last_page_num--;
} while (last_page_num > 0);
}catch(Exception e){
System.out.println( e.getMessage() );
} finally {
//关闭浏览器(这个包括驱动完全退出,会清除内存),close 是只关闭浏览器
driver.quit();
return lists;
}
}
//封装数据
private SchoolCond saveSchoolCondInfo(Map<String, Object> map) {
SchoolCond schoolCond=new SchoolCond();
schoolCond.setSchool_id( (Integer) map.get( "school_id" ) );
schoolCond.setName( (String) map.get( "name" ) );
schoolCond.setRank( (Integer) map.get( "rank" ) );
schoolCond.setRank_type( (Integer) map.get( "rank_type" ) );
schoolCond.setLevel_name( (String) map.get( "level_name" ) );
schoolCond.setBelong( (String) map.get( "belong" ) );
schoolCond.setCity_name( (String) map.get( "city_name" ) );
schoolCond.setCounty_name( (String) map.get( "county_name" ) );
schoolCond.setProvince_name( (String) map.get( "province_name" ) );
schoolCond.setNature_name( (String) map.get( "nature_name" ) );
return schoolCond;
}
}
package com.fslq.pojo;
import javax.persistence.*;
/*条件查询高校数据库实体类*/
@Entity
@Table(name="school_cond")
public class SchoolCond {
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
private Integer id;/*编号*/
private Integer school_id;/*学校编号*/
private String name;/*学校名称*/
private Integer rank;/*全国热度排名*/
private Integer rank_type;/*类别热度排名*/
private String level_name;/*办学类型*/
private String belong;/*附属院校*/
private String city_name;/*所在城市*/
private String county_name;/*所在区域*/
private String province_name;/*所在省*/
private String nature_name;/*院校类型*/
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id=id;
}
public Integer getSchool_id() {
return school_id;
}
public void setSchool_id(Integer school_id) {
this.school_id=school_id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name=name;
}
public Integer getRank() {
return rank;
}
public void setRank(Integer rank) {
this.rank=rank;
}
public Integer getRank_type() {
return rank_type;
}
public void setRank_type(Integer rank_type) {
this.rank_type=rank_type;
}
public String getLevel_name() {
return level_name;
}
public void setLevel_name(String level_name) {
this.level_name=level_name;
}
public String getBelong() {
return belong;
}
public void setBelong(String belong) {
this.belong=belong;
}
public String getCity_name() {
return city_name;
}
public void setCity_name(String city_name) {
this.city_name=city_name;
}
public String getCounty_name() {
return county_name;
}
public void setCounty_name(String county_name) {
this.county_name=county_name;
}
public String getProvince_name() {
return province_name;
}
public void setProvince_name(String province_name) {
this.province_name=province_name;
}
public String getNature_name() {
return nature_name;
}
public void setNature_name(String nature_name) {
this.nature_name=nature_name;
}
@Override
public String toString() {
return "SchoolCond{"+
"id="+id+
", school_id="+school_id+
", name='"+name+'\''+
", rank="+rank+
", rank_type="+rank_type+
", level_name='"+level_name+'\''+
", belong='"+belong+'\''+
", city_name='"+city_name+'\''+
", county_name='"+county_name+'\''+
", province_name='"+province_name+'\''+
", nature_name='"+nature_name+'\''+
'}';
}
}
package com.fslq.task;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fslq.pipeline.*;
import com.fslq.pojo.SchoolCond;
import com.fslq.pojo.SchoolInfo;
import com.fslq.service.SchoolService;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.io.IOException;
import java.util.*;
import static java.lang.Thread.sleep;
/*相关操作*/
@Component
public class AppProcessor implements PageProcessor {
@Autowired
SchoolCondDataPipeline schoolCondDataPipeline;
public static final ObjectMapper MAPPER=new ObjectMapper();//解析json数据
@Override
public void process(Page page) {
}
private Site site=Site.me()
.setCharset( "utf8" )
.setCycleRetryTimes( 3 )
.setSleepTime( 10* 1000 );
//.addHeader( "Connection","keep-alive" );
@Override
public Site getSite() {
return site;
}
//执行爬虫
@Scheduled(initialDelay=1000,fixedDelay=10*1000)//定时任务不该有参数
public void fun() {
Spider.create( new AppProcessor() )
.addUrl( "https://gkcx.eol.cn/school/search?recomschprop=%E7%BB%BC%E5%90%88&argschtype=%E6%99%AE%E9%80%9A%E6%9C%AC%E7%A7%91" )//设置爬取页面
.thread(1)
.addPipeline( schoolCondDataPipeline )//数据入库
.run();//执行爬虫
}
}
package com.fslq;
import com.fslq.model.StartParam;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.scheduling.annotation.EnableScheduling;
//SpringBoot:开启定时任务
@SpringBootApplication
@EnableScheduling//计划表
public class Application {
public static void main(String[] args) {
SpringApplication.run( Application.class,args );
}
}
create table school_cond(
id int(2) auto_increment
primary key,
school_id int(2) unique null,
name varchar(60) null,
`rank` int(2) null,
rank_type int(2) null,
level_name varchar(18) null,
belong varchar(45) null,
city_name varchar(21) null,
county_name varchar(18) null,
province_name varchar(18) null,
nature_name varchar(18) null
);
1.2.3 通过模拟事件爬取(加密)
针对部分网站的json字符串,存在被加密现象,因此,需要换种方式爬取数据,这里采用Chrome75,chromedriver, [selenium](https://baike.baidu.com/item/Selenium/18266?fr=aladdin) ,来爬取数据,效率比较低,但很有效,这里爬取的是招生计划,重点在于各种xpath元素路径的定位。
package com.fslq.pipeline;
import com.fslq.pojo.EnrollmentPlan;
import com.fslq.service.SchoolService;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.interactions.Actions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import static java.lang.Thread.sleep;
/*招生计划*/
@Component
public class EnrollmentPlanDataPipeline implements Pipeline {
@Autowired
private SchoolService schoolService;
@Override
public void process(ResultItems resultItems, Task task) {
// String path="D:\\WorkSpace\\Idea\\crawler\\crawler_college\\src\\main\\resources\\startparam.txt";
// String path=this.getClass().getClassLoader().getResource("./startparam.txt").getPath() ;
// System.setProperty( "webdriver.chrome.driver","C:\\Users\\22966\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe" );
WebDriver driver=new ChromeDriver( /*new ChromeOptions().addArguments( "--headless" )*/ );
try{
List<Long> allSchoolId=schoolService.IsExistAllId();
System.out.println("遍历ID:"+allSchoolId);
AtomicInteger i=new AtomicInteger(1);
do {
allSchoolId.stream().forEach( p->{
//loop crawler the exception school
/* String s=null;
try( BufferedReader br=new BufferedReader( new InputStreamReader(new FileInputStream(new File(path)),StandardCharsets.UTF_8 ) )
){
s=br.readLine();
}catch(IOException e){
e.printStackTrace();
}
int startParam= Integer.parseInt( s.trim());*/
System.out.println("第"+i.get()+"个学校编号:"+p );
try {
sleep( 10*i.get() );
Set<EnrollmentPlan> enrollmentPlan=findEnrollmentPlan( driver,p );
sleep( 50*i.get() );
System.out.println("*********************************************");
int count=0;
/* for (EnrollmentPlan plan : enrollmentPlan) {
if (plan.getPlanNum()==0) {
count++;
}
}*/
//System.out.println(enrollmentPlan);
System.out.println("*********************************************");
if (count==0)
{
//sleep( 20*i.get() );
schoolService.saveAllEnrollmentPlan(enrollmentPlan);
sleep( 25*i.get() );
}
} catch (Exception e) {
/* try(BufferedWriter bw=new BufferedWriter( new OutputStreamWriter(new FileOutputStream(new File(path)),StandardCharsets.UTF_8 ))){
bw.write(String.valueOf( i.get()) );
i.set( i.get()-1 );
bw.flush();
}catch(IOException ex){
ex.printStackTrace();
}*/
System.out.println("第"+i.get()+"个学校编号:"+p+"歇菜:"+e.getMessage() );
}
i.getAndIncrement();
} );
allSchoolId=schoolService.IsExistAllId();
}while (allSchoolId.size()==0);
}catch(Exception e){
System.out.println( "异常:"+e.getMessage() );
} finally {
//关闭浏览器(这个包括驱动完全退出,会清除内存),close 是只关闭浏览器
driver.quit();
}
}
private Set<EnrollmentPlan> findEnrollmentPlan(WebDriver driver, Long p) throws InterruptedException {
int time=30;
Set<EnrollmentPlan> lists=new HashSet<>( );
//1.Get first page
//https://gkcx.eol.cn/school/338/provinceline
WebDriverWait wait=new WebDriverWait( driver,50 );
/**
* /html/body/div[4]/div/div[2]/div/div[2]/div[1]/div
*请选择所在省份及城市
* /html/body/div[4]/div/div[2]/div/div[2]/div[3]/button
*/
JavascriptExecutor driver_js=((JavascriptExecutor) driver);
AtomicReference<String> path=new AtomicReference<>( "/html/body/div[4]/div/div[2]/div/div[2]/div[3]/button" );
driver.get("https://gkcx.eol.cn/school/" +p+"/provinceline");
sleep( 3*time );
// if(isLocation(driver,path.get() ))
// wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get()))).click();
// sleep( time );
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[3]/form/div/div/div/span/div/div/div/div" );
/* if(wait.until( ExpectedConditions.elementToBeClickable( driver.findElement( By.xpath( path.get() ) ) ) ).isDisplayed()){
wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath( path.get() ))).click();
}*/
WebElement until3 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript( "arguments[0].click();",until3 );
// sleep(time*3);
path.set( "/html/body/div[4]/div/div/div/ul/li" );
//waitToBeLoaded( driver,path.get() );
sleep(time);
List<WebElement> until1=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get() ) ) );
// driver_js.executeScript( "arguments[0].click();",until1 );
if(until1.size()<2&&(until1.get( 0 ).getText().contains( "年份" )||!until1.get( 0 ).getText().contains( "2018" )))
{
EnrollmentPlan enrollmentPlan=new EnrollmentPlan();
enrollmentPlan.setSchoolId( p );
lists.add( enrollmentPlan );
// System.out.println("运行");
return lists;
}else if(until1.size()==1&&until1.get( 0 ).getText().contains( "2018" )) {
// System.out.println("运行1");
wait.until( ExpectedConditions.presenceOfElementLocated(By.xpath( path.get() ) ) ).click();
}else if(until1.size()>2){
// System.out.println("运行2");
WebElement until = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get() + "[2]")));
driver_js.executeScript( "arguments[0].click();",until );
}
//3.Choose province
path.set( "//*[@id='root']/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[1]/div/div/div" );
// waitToBeLoaded( driver,path.get() );
WebElement until7 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript( "arguments[0].click();",until7 );
path.set( "/html/body/div[5]/div/div/div/ul/li" );
//waitToBeLoaded( driver,path.get() );
List<WebElement> provinces = wait.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.xpath( path.get() )));
for (WebElement s : provinces) {
List<EnrollmentPlan> list=new ArrayList<>();
path.set( "//*[@id='root']/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[1]/div/div/div" );
waitToBeLoaded( driver,path.get() );
WebElement until8 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript( "arguments[0].click();",until8 );
try{
sleep( time *3 );
}catch(InterruptedException e){
e.printStackTrace();
}
Actions actions=new Actions( driver ).moveToElement( s );
//二级菜单下拉框滚动
actions.sendKeys( Keys.ARROW_DOWN );
actions.click( s ).perform();
try{
sleep( time *2);
}catch(InterruptedException e){
e.printStackTrace();
}
String temp_pro=wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( path.get()+"/div" ) ) ).getAttribute( "title" );
//4.Choose subject type
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[2]/div/div/div" );
waitToBeLoaded( driver,path.get() );
WebElement until6 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript( "arguments[0].click();",until6 );
path.set( "/html/body/div[6]/div/div/div/ul/li" );
waitToBeLoaded( driver,path.get() );
List<WebElement> sub_types=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get() ) ) );
for (WebElement sub : sub_types) {
if(sub.getText().equals( "科类" ))
continue;
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[2]/div/div/div" );
waitToBeLoaded( driver,path.get() );
//4.Choose subject type
WebElement until5 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript( "arguments[0].click();",until5 );
new Actions( driver ).moveToElement( sub ).click().perform();
//5.Choose betch
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[4]/form/div/div/div/span/div" );
waitToBeLoaded( driver,path.get() );
WebElement until=wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( path.get() ) ) );
//处理异常:element click intercepted: Element <div class="ant-select-selection
driver_js.executeScript( "arguments[0].click();",until );
path.set( "/html/body/div[7]/div/div/div/ul/li" );
waitToBeLoaded( driver,path.get() );
try{
sleep( 2*time );
}catch(InterruptedException e){
e.printStackTrace();
}
// System.out.println("*******1");
List<WebElement> betchs=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get() ) ) );
// System.out.println("**********"+betchs.size());
for (WebElement b : betchs) {
try{
sleep( time*2 );
}catch(InterruptedException e){
e.printStackTrace();
}
// System.out.println("*********3");
new Actions( driver ).moveToElement( b ).click().perform();
// System.out.println("********4");
String temp_bet=driver.findElement( By.xpath( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[4]/form/div/div/div/span/div/div" ) ).getText();
try{
sleep( time*2 );
}catch(InterruptedException e){
e.printStackTrace();
}
// System.out.println("***********5");
if(temp_bet.equals( "批次" ))
continue;
//6.Get page's content
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[2]/div[1]/table/tbody/tr[2]" );
boolean isEmpty=IsEmptyTds(driver,path.get());
if(!isEmpty)
{
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[2]/div[3]/div/div/ul/li" );
List<WebElement> do_page=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get()) ) );
Integer last_page_num=do_page.size()-2;
//6.2 for..
do {
//6.3 get..<tr>
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[2]/div[1]/table/tbody/tr[position()>1]" );
//waitToBeLoaded( driver,path.get() );
try{
sleep( 10*time );
}catch(InterruptedException e){
e.printStackTrace();
}
List<WebElement> trs=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get() ) ) );
for (WebElement tr : trs) {//G
// et <td>
System.out.println("44444444");
sleep(time*25);
List<WebElement> tds=tr.findElements( By.xpath( "td" ) );
String temp=tds.get( 0 ).getText();
if("-".equals( temp )){
// System.out.println( "遇到‘-’跳出" );
continue;
}
EnrollmentPlan enrollmentPlan=new EnrollmentPlan();
enrollmentPlan.setBatch( temp_bet );
enrollmentPlan.setSchoolId( p );
enrollmentPlan.setProvince( temp_pro );
enrollmentPlan.setSpecialName( temp );
enrollmentPlan.setSubjectType( tds.get( 1 ).getText() );
enrollmentPlan.setSpecialType( tds.get( 2 ).getText() );
String temp_num_1=tds.get( 3 ).getText();
String temp_num_2=tds.get( 4 ).getText();
if(!temp_num_1.equals( "-" )&&temp_num_2.equals( "-" ))
enrollmentPlan.setPlanNum( Long.parseLong( temp_num_1 ) );
if(!temp_num_2.equals( "-" )&&temp_num_1.equals( "-" ))
enrollmentPlan.setPlanNum( Long.parseLong( temp_num_2 ) );
sleep( time );
System.out.println(enrollmentPlan);
lists.add( enrollmentPlan );
}
//下一页
if(do_page.size() > 3){
int next=do_page.size()-1;
System.out.println("5555555555");
sleep(time*25);
//wait.until( ExpectedConditions.elementToBeClickable( do_page.get( next ) ) ).click();
WebElement until2 = do_page.get(next);
driver_js.executeScript("arguments[0].click();",until2);
}
last_page_num--;
} while (last_page_num > 0);
}
//批次跳转
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[4]/form/div/div/div/span/div" );
waitToBeLoaded( driver,path.get() );
WebElement until2 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript("arguments[0].click();",until2);
}
//批次初始化
if(betchs.size() > 1){
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[4]/form/div/div/div/span/div/div" );
waitToBeLoaded( driver,path.get() );
WebElement until4 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript( "arguments[0].click();",until4 );
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
path.set( "/html/body/div[7]/div/div/div/ul/li[1]" );
waitToBeLoaded( driver,path.get() );
// System.out.println( "遇到‘-’跳出1" );
WebElement until2=wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( path.get() ) ) );
// System.out.println( "遇到‘-’跳出2" );
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
// System.out.println( "遇到‘-’跳出3" );
new Actions( driver ).moveToElement( until2 ).click().perform();
// System.out.println( "遇到‘-’跳出4" );
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
}
//鼠标悬浮事件点击
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
}
//schoolService.saveAllEnrollmentPlan( lists );
}
//7 Save lists
return lists;
}
private boolean IsEmptyTds(WebDriver driver,String s) throws InterruptedException {
WebDriverWait wait=new WebDriverWait( driver,20 );
WebElement tr=wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( s ) ) );
String td=tr.findElements( By.xpath( "td" ) ).get( 0 ).getText();
return td.equals( "-" );
}
//阻塞
private void waitToBeLoaded(WebDriver driver,String path) {
while (!isLocation( driver,path )) {
try{
//System.out.println(1);
sleep( 500 );
}catch(InterruptedException e){
e.printStackTrace();
}
}
try{
sleep( 500 );
}catch(InterruptedException e){
e.printStackTrace();
}
}
//判断
private boolean isLocation(WebDriver driver,String path) {
try{
sleep( 500 );
driver.findElement( By.xpath( path ) );
System.out.println("定位到:"+path);
return true;
}catch(Exception e){
System.out.println("定位不到:"+path);
return false;
}
}
}
#建立临时表
create table if not exists enrollment_plan_temp select * from enrollment_plan where id<816 order by id ASC;
drop temporary table enrollment_plan_temp;
select * from enrollment_plan_temp;
#建立索引
alter table enrollment_plan_temp add index school_id_index(school_id);
alter table enrollment_plan_temp add index school_province_index(province);
alter table enrollment_plan_temp add index school_plan_num_index(plan_num);
alter table enrollment_plan_temp add index school_batch_index(batch);
alter table enrollment_plan_temp add index school_special_name_index(special_name);
drop table enrollment_plan_temp_unique;
create table if not exists enrollment_plan_temp_unique(
id int(2) primary key auto_increment,
school_id int(2) comment '学校编号',
province varchar(30) comment '生源地',
subject_type varchar(30) comment '科类',
batch varchar(35) comment '批次',
special_name varchar(500) comment '专业名称',
special_type varchar(40) comment '专业门类',
plan_num int(2) comment '招生人数'
);
drop procedure enrollment_plan_temp_unique_pro;
create procedure enrollment_plan_temp_unique_pro()
begin
DECLARE id1 int(2) ;
declare school_id1 int(2) ;
declare province1 varchar(30);
declare subject_type1 varchar(30);
declare batch1 varchar(35);
declare special_name1 varchar(500);
declare special_type1 varchar(40);
declare plan_num1 int(2) ;
declare count_num int(2);
declare plan_num_temp int(2);
declare id_temp int(2);
-- 遍历数据结束标志
DECLARE done INT DEFAULT FALSE;
-- 定义游标
DECLARE cur CURSOR FOR select id,school_id,province,subject_type,batch,special_name,special_type,plan_num FROM enrollment_plan_temp;
-- 将结束标志绑定到游标
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
-- 打开游标
set count_num=0;
set id_temp=0;
OPEN cur;
-- 开始循环
read_loop: LOOP
-- 提取游标里的数据,这里只有一个,多个的话也一样;
FETCH cur INTO id1,school_id1,province1,subject_type1,batch1,special_name1,special_type1,plan_num1;
if done then
leave read_loop;
end if;
-- 业务逻辑
-- 找到重复的
select count(*) into count_num from enrollment_plan_temp_unique t where t.special_name like special_name1 and t.province = province1 and t.school_id=school_id1 ;
if count_num=0 then#表中没有则插入
INSERT into enrollment_plan_temp_unique(school_id,province,subject_type,batch,special_name,special_type,plan_num) VALUES(school_id1,province1,subject_type1,batch1,special_name1,special_type1,plan_num1);
else#重复count_num=1,表中有则更新
begin
update enrollment_plan_temp_unique SET plan_num = plan_num + plan_num1 where id in (select a.id from
(select id
from enrollment_plan_temp_unique
where special_name = special_name1
and province = province1
and school_id = school_id1
) a);
set count_num=0;
end;
end if;
END LOOP;
-- 关闭游标
CLOSE cur;
end;
call enrollment_plan_temp_unique_pro();
#两表合并,表一截取括号前面的于表二字符串匹配
create table if not exists result_college select e.id,e.school_id,e.province,e.special_name,e.subject_type,e.special_type,e.batch,e.plan_num,s.score,s.position_num from enrollment_plan_temp_unique_v3 e ,special_score_temp_unique s where e.school_id=s.school_id and e.province=s.province and e.special_name like concat(substring_index(s.special_name,'(',1),'%');
drop table result_college_unique;
create table if not exists result_college_unique(
id int(2) primary key auto_increment,
school_id int(2) comment '学校编号',
province varchar(30) comment '生源地',
subject_type varchar(30) comment '科类',
batch varchar(35) comment '批次',
special_name varchar(500) comment '专业名称',
special_type varchar(40) comment '专业门类',
plan_num int(2) comment '招生人数',
score int(2) comment '录取分数',
position_num int(2) comment '位次排名'
);
drop procedure result_college_unique_pro;
create procedure result_college_unique_pro()
begin
DECLARE id1 int(2) ;
declare school_id1 int(2) ;
declare province1 varchar(30);
declare subject_type1 varchar(30);
declare batch1 varchar(35);
declare special_name1 varchar(500);
declare special_type1 varchar(40);
declare plan_num1 int(2) ;
declare count_num int(2);
declare score1 int(2);
declare position_num1 int(2);
declare id_temp int(2);
-- 遍历数据结束标志
DECLARE done INT DEFAULT FALSE;
-- 定义游标
DECLARE cur CURSOR FOR select id,school_id,province,subject_type,batch,special_name,special_type,plan_num,score,position_num FROM result_college;
-- 将结束标志绑定到游标
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
-- 打开游标
set count_num=0;
set id_temp=0;
OPEN cur;
-- 开始循环
read_loop: LOOP
-- 提取游标里的数据,这里只有一个,多个的话也一样;
FETCH cur INTO id1,school_id1,province1,subject_type1,batch1,special_name1,special_type1,plan_num1,score1,position_num1 ;
if done then
leave read_loop;
end if;
-- 业务逻辑
-- 找到重复的
select count(*) into count_num from result_college_unique t where t.special_name = special_name1 and t.province = province1 and t.school_id=school_id1 ;
if count_num=0 then#表中没有则插入
INSERT into result_college_unique(school_id,province,subject_type,batch,special_name,special_type,plan_num,score,position_num) VALUES(school_id1,province1,subject_type1,batch1,special_name1,special_type1,plan_num1,score1,position_num1);
else#重复count_num=1,表中有则更新
begin
set count_num=0;
end;
end if;
END LOOP;
-- 关闭游标
CLOSE cur;
end;
call result_college_unique_pro();
delete from result_college_unique where special_type='-';
select r.id from school_special s ,result_college_unique r where s.specials like concat(substring_index(r.special_name,'类',1),'%') or s.specials like concat(r.special_name,'%') and s.school_id=r.school_id;
create table if not exists results(
id int(2) primary key auto_increment,
school_id int(2) comment '学校编号',
province varchar(30) comment '生源地',
subject_type varchar(30) comment '科类',
batch varchar(35) comment '批次',
special_name varchar(500) comment '专业名称',
special_type varchar(40) comment '专业门类',
plan_num int(2) comment '招生人数',
score int(2) comment '录取分数',
position_num int(2) comment '位次排名',
is_priory int(2) comment '一级学科'
);
insert into results(id, school_id, province, subject_type, batch, special_name, special_type, plan_num, score, position_num) select * from result_college_unique;
update results set is_priory=1 where id IN (select r.id from school_special s ,result_college_unique r where s.specials like concat(substring_index(r.special_name,'类',1),'%') or s.specials like concat(r.special_name,'%') and s.school_id=r.school_id);
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。