增加HDU题目爬虫 v0.5

This commit is contained in:
Howie 2021-02-15 16:34:50 +08:00
parent b8b844336a
commit c2a3185356
4 changed files with 76 additions and 18 deletions

View File

@ -1,18 +1,19 @@
package top.hcode.hoj.remoteJudge.task.Impl;
import cn.hutool.core.codec.Base64;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.ReUtil;
import cn.hutool.json.JSONUtil;
import jdk.nashorn.internal.runtime.regexp.RegExp;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringEscapeUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities;
import top.hcode.hoj.pojo.entity.Problem;
import top.hcode.hoj.remoteJudge.task.RemoteJudgeStrategy;
import top.hcode.hoj.util.Constants;
import top.hcode.hoj.util.JsoupUtils;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
@ -20,12 +21,14 @@ import java.util.regex.Pattern;
@Slf4j
public class HduJudge implements RemoteJudgeStrategy {
public static final String host = "http://acm.hdu.edu.cn";
public static final String loginUrl = "/userloginex.php?action=login";
public static final String submitUrl = "/submit.php?action=submit";
public static final String statusUrl = "/status.php?user=%s&pid=%d";
public static final String queryUrl = "/status.php?first=%d";
public static final String errorUrl = "/viewerror.php?rid=%d";
// Short name of this remote judge; used as the prefix of Problem#source ("HDU-<problemId>").
public static final String JUDGE_NAME = "HDU";
// Base URL that every relative path constant below is appended to.
public static final String HOST = "http://acm.hdu.edu.cn";
// Path the login form is POSTed to when obtaining the login cookie.
public static final String LOGIN_URL = "/userloginex.php?action=login";
// Path a code submission is POSTed to.
public static final String SUBMIT_URL = "/submit.php?action=submit";
// Status list filtered by user and problem; format arguments: %s = user name, %d = problem id.
public static final String STATUS_URL = "/status.php?user=%s&pid=%d";
// Status list queried by run; format argument: %d = submit id.
public static final String QUERY_URL = "/status.php?first=%d";
// Compile-error detail page; format argument: %d = submit id.
public static final String ERROR_URL = "/viewerror.php?rid=%d";
// Problem statement page; format argument: %s = problem id.
public static final String PROBLEM_URL = "/showproblem.php?pid=%s";
/**
* @param problemId 提交的题目id
@ -39,7 +42,7 @@ public class HduJudge implements RemoteJudgeStrategy {
return -1L;
}
Map<String, String> loginCookie = getLoginCookie();
Connection connection = JsoupUtils.getConnectionFromUrl(host + submitUrl, null, loginCookie);
Connection connection = JsoupUtils.getConnectionFromUrl(HOST + SUBMIT_URL, null, loginCookie);
Connection.Response response = JsoupUtils.postResponse(connection, MapUtil
.builder(new HashMap<String, String>())
.put("check", "0")
@ -58,7 +61,7 @@ public class HduJudge implements RemoteJudgeStrategy {
@Override
public Map<String, Object> result(Long submitId) throws Exception {
String url = host + String.format(queryUrl, submitId);
String url = HOST + String.format(QUERY_URL, submitId);
Connection connection = JsoupUtils.getConnectionFromUrl(url, null, null);
Connection.Response response = JsoupUtils.getResponse(connection, null);
// 1提交时间 2结果 3执行时间 4执行空间 5代码长度
@ -83,7 +86,7 @@ public class HduJudge implements RemoteJudgeStrategy {
result.put("memory", Integer.parseInt(executionMemory));
// 如果CE了则还需要获得错误信息
if (statusType == Constants.Judge.STATUS_COMPILE_ERROR) {
connection.url(host + String.format(errorUrl, submitId));
connection.url(HOST + String.format(ERROR_URL, submitId));
response = JsoupUtils.getResponse(connection, null);
String compilationErrorInfo = ReUtil.get("(<pre>[\\s\\S]*?</pre>)", response.body(), 1);
result.put("CEInfo", compilationErrorInfo);
@ -91,9 +94,39 @@ public class HduJudge implements RemoteJudgeStrategy {
return result;
}
/**
 * Crawls the HDU problem page for {@code problemId} and maps it onto a {@link Problem}.
 *
 * <p>Title, time limit and memory limit are mandatory: if any of them cannot be found
 * in the page, an {@link IllegalStateException} naming the missing field is thrown
 * instead of an anonymous NullPointerException / NumberFormatException. Description,
 * input, output and hint are optional and are left {@code null} when absent.
 *
 * @param problemId HDU problem number; must be a positive integer without leading zeros
 * @return a populated {@link Problem} flagged as remote, with source {@code "HDU-<problemId>"}
 * @throws IllegalArgumentException if {@code problemId} is null or not a valid problem number
 * @throws IllegalStateException    if a mandatory field cannot be extracted from the page
 * @throws Exception                if fetching the page fails
 */
@Override
public Problem getProblemInfo(String problemId) throws Exception {
    // Reject null and malformed ids before they are formatted into the request URL.
    Validate.isTrue(problemId != null && problemId.matches("[1-9]\\d*"),
            "Invalid HDU problem id: " + problemId);

    String url = HOST + String.format(PROBLEM_URL, problemId);
    log.debug("Fetching HDU problem page: {}", url);
    Connection connection = JsoupUtils.getConnectionFromUrl(url, null, null);
    Document document = JsoupUtils.getDocument(connection, null);
    String html = document.html();

    Problem info = new Problem();
    info.setTitle(requirePageField("color:#1A5CC8\">([\\s\\S]*?)</h1>", html, "title", problemId).trim());
    // \d+ (not \d*) so a bare " MS" / " K" can never yield an empty, unparsable capture.
    info.setTimeLimit(Integer.parseInt(requirePageField("(\\d+) MS", html, "time limit", problemId)));
    info.setMemoryLimit(Integer.parseInt(requirePageField("/(\\d+) K", html, "memory limit", problemId)));
    info.setDescription(ReUtil.get(">Problem Description</div>\\s+<.*?>(.*?)<br></div>", html, 1));
    info.setInput(ReUtil.get(">Input</div>.*?<.*?>(.*?)<br></div>", html, 1));
    info.setOutput(ReUtil.get(">Output</div>.*?<.*?>(.*?)<br></div>", html, 1));

    String sampleInput = ReUtil.get(">Sample Input</div><div .*?,monospace;\">([\\s\\S]*?)</div></pre>", html, 1);
    // TODO separate the sample output from the hint.
    // NOTE(review): the greedy (.*) runs to the end of the line and the trailing lazy group
    // can match zero times, so hint markup on the same line appears to be captured too.
    String sampleOutput = ReUtil.get(">Sample Output</div><.*?monospace;\">(.*)(<div style=)*?", html, 1);
    // A missing sample becomes an empty element rather than the literal text "null".
    String examples = "<input>" + (sampleInput == null ? "" : sampleInput) + "</input>"
            + "<output>" + (sampleOutput == null ? "" : sampleOutput) + "</output>";
    info.setExamples(examples);

    // Stays null for problems that have no hint section.
    info.setHint(ReUtil.get("<i>Hint</i></div>([\\s\\S]*?)<br><[^<>]*?panel_title[^<>]*?>", html, 1));
    info.setIsRemote(true);
    info.setSource(JUDGE_NAME + "-" + problemId);
    info.setType(0);
    return info;
}

/**
 * Returns capture group 1 of {@code regex} in {@code html}, failing with a descriptive
 * exception when the pattern does not match.
 */
private static String requirePageField(String regex, String html, String fieldName, String problemId) {
    String value = ReUtil.get(regex, html, 1);
    if (value == null) {
        throw new IllegalStateException(
                "Cannot extract " + fieldName + " from HDU problem page, problemId=" + problemId);
    }
    return value;
}
@Override
public Map<String, String> getLoginCookie() throws Exception {
Connection connection = JsoupUtils.getConnectionFromUrl(host + loginUrl, null, null);
Connection connection = JsoupUtils.getConnectionFromUrl(HOST + LOGIN_URL, null, null);
Connection.Response response = JsoupUtils.postResponse(connection, MapUtil
.builder(new HashMap<String, String>())
// TODO 添加账号密码 暂时写死测试后续将在队列中获取空闲账号
@ -125,9 +158,10 @@ public class HduJudge implements RemoteJudgeStrategy {
}
}
public Long getMaxRunId(Connection connection, String userName, Long problemId) throws Exception {
String url = String.format(statusUrl, userName, problemId);
connection.url(host + url);
String url = String.format(STATUS_URL, userName, problemId);
connection.url(HOST + url);
Connection.Response response = JsoupUtils.getResponse(connection, null);
Matcher matcher = Pattern.compile("<td height=22px>(\\d+)").matcher(response.body());
return matcher.find() ? Long.parseLong(matcher.group(1)) : -1L;

View File

@ -1,5 +1,7 @@
package top.hcode.hoj.remoteJudge.task;
import top.hcode.hoj.pojo.entity.Problem;
import java.util.Map;
@ -25,4 +27,11 @@ public interface RemoteJudgeStrategy {
String getLanguage(String language);
/**
 * Returns the problem identified by {@code problemId} as a {@link Problem}.
 *
 * @param problemId problem id; typed as String because some problem sets use ids
 *                  that are not purely numeric
 * @return the Problem object
 * @throws Exception declared so that implementations can propagate any failure
 */
Problem getProblemInfo(String problemId) throws Exception;
}

View File

@ -5,6 +5,7 @@ import cn.hutool.json.JSONObject;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities;
import java.io.IOException;
import java.util.Map;
@ -47,6 +48,16 @@ public class JsoupUtils {
}
return connection.method(Connection.Method.GET).execute();
}
/**
 * Executes {@code connection} as a GET request and returns the parsed document.
 *
 * <p>The returned document's output settings are switched to the xhtml escape mode
 * with pretty-printing disabled.
 *
 * @param connection prepared jsoup connection
 * @param getData    request parameters to attach, or {@code null} for none
 * @return the parsed document
 * @throws IOException if the request fails
 */
public static Document getDocument(Connection connection, Map<String, String> getData) throws IOException {
    // Attach request parameters only when the caller supplied some.
    if (getData != null) {
        connection.data(getData);
    }
    Document page = connection.get();
    page.outputSettings()
            .escapeMode(Entities.EscapeMode.xhtml)
            .prettyPrint(false);
    return page;
}
}

View File

@ -1,6 +1,7 @@
package top.hcode.hoj.remoteJudge.task.Impl;
import org.junit.jupiter.api.Test;
import top.hcode.hoj.pojo.entity.Problem;
import java.io.IOException;
import java.util.Map;
@ -14,8 +15,11 @@ class HduJudgeTest {
HduJudge hduJudge = new HduJudge();
try {
Map<String, Object> submit = hduJudge.result(35329033L);
System.out.println(submit);
// 有hint
Problem problemInfo = hduJudge.getProblemInfo("1425");
// 无hint
// Problem problemInfo = hduJudge.getProblemInfo("1090");
System.out.println(problemInfo);
} catch (Exception e) {
e.printStackTrace();
}