Fix a bug where a title request that returns multiple results was handled incorrectly (only the first match was processed)

This commit is contained in:
or 2019-08-05 16:29:26 +08:00
parent f57d03d58a
commit 8ec4bf848f
1 changed file with 47 additions and 22 deletions

View File

@ -2,6 +2,7 @@ package cn.piflow.bundle.nsfc.externalAcquisition
import java.io.{ByteArrayInputStream, IOException} import java.io.{ByteArrayInputStream, IOException}
import java.net.URLEncoder import java.net.URLEncoder
import java.util
import java.util.regex.{Matcher, Pattern} import java.util.regex.{Matcher, Pattern}
import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.bean.PropertyDescriptor
@ -27,7 +28,7 @@ class ScopusSearchArticle extends ConfigurableStop {
override val outportList: List[String] = List(PortEnum.scopus_articlePort.toString,PortEnum.scopus_article_api_response.toString) override val outportList: List[String] = List(PortEnum.scopus_articlePort.toString,PortEnum.scopus_article_api_response.toString)
var author = new StringBuilder
var intervalTime:String = _ var intervalTime:String = _
var apiKey :String= _ var apiKey :String= _
var httpurl :String= _ var httpurl :String= _
@ -66,11 +67,22 @@ class ScopusSearchArticle extends ConfigurableStop {
var eid:String =null var eid:String =null
var prismUrls: util.ArrayList[String]= new util.ArrayList[String]()
var scopusIds: util.ArrayList[String]= new util.ArrayList[String]()
var num = 1
try { try {
inDf.collect().foreach(x=>{ inDf.collect().foreach(x=>{
title = URLEncoder.encode(x.get(0).asInstanceOf[String],"UTF-8") title = URLEncoder.encode(x.get(0).asInstanceOf[String],"UTF-8")
titleUrl = httpurl+s"(${title})&apiKey=${apiKey}&httpAccept=application%2Fjson" titleUrl = httpurl+s"(${title})&apiKey=${apiKey}&httpAccept=application%2Fjson"
println(titleUrl)
titleJsonString = getHttp(titleUrl) titleJsonString = getHttp(titleUrl)
// Filter valid request scoups files // Filter valid request scoups files
@ -79,32 +91,41 @@ class ScopusSearchArticle extends ConfigurableStop {
titleJsonString = titleJsonString.replace("\"dc:identifier\":\"SCOPUS_ID:","\"scopus_id\":\"") titleJsonString = titleJsonString.replace("\"dc:identifier\":\"SCOPUS_ID:","\"scopus_id\":\"")
reg = "scopus_id\":\"(.*?)\"," reg = "scopus_id\":\"(.*?)\","
scopus_id = regula(reg,titleJsonString) scopusIds = regula(reg,titleJsonString)
println(scopus_id)
// Request author information // Request author information
if (titleJsonString.contains("prism:url")) { if (titleJsonString.contains("prism:url")) {
reg = "prism:url\":\"(.*?)\"," reg = "prism:url\":\"(.*?)\","
prismUrl= regula(reg,titleJsonString)+s"?field=author&apikey=${apiKey}&httpAccept=application%2Fjson"
prismUrls = regula(reg, titleJsonString)
for (x <- 0 until prismUrls.size) {
println(num+"----------------------"+scopusIds.get(x))
num+=1
prismUrl = prismUrls.get(x) + s"?field=author&apikey=${apiKey}&httpAccept=application%2Fjson"
authorJsonString = getHttp(prismUrl) authorJsonString = getHttp(prismUrl)
reg = "preferred-name\": \\{\n.*?\"ce:given-name\": \"(.*?)\",\n.*?,\n.*?ce:surname\": \"(.*?)\"," reg = "preferred-name\": \\{\n.*?\"ce:given-name\": \"(.*?)\",\n.*?,\n.*?ce:surname\": \"(.*?)\","
authorString = regulaAuthor(reg, authorJsonString).stripSuffix("#") authorString = regulaAuthor(reg, authorJsonString).stripSuffix("#")
titleJsonString=titleJsonString.replace("\"prism:url\":","\"authorString\":\""+authorString+"\",\"prism:url\":")
titleJsonString = titleJsonString.replace("\"prism:url\":\"" + prismUrls.get(x), "\"authorString\":\"" + authorString + "\",\"prism:url\":\"" + prismUrls.get(x))
inputStream = new ByteArrayInputStream((scopusIds.get(x) + "##&##" + titleJsonString + "##&##" + authorJsonString.replace("\n", " ") + "\n").getBytes("utf-8"))
IOUtils.copyBytes(inputStream, scopus_api_responseOut, 4096, false)
} }
}
// Request summary information // Request summary information
if (titleJsonString.contains("eid")){ // if (titleJsonString.contains("eid")) reg = "eid\":\"(.*?)\","
reg = "eid\":\"(.*?)\","
eid = regula(reg,titleJsonString)
}
println(scopus_id)
inputStream = new ByteArrayInputStream((scopus_id+"##&##"+titleJsonString+"##&##"+authorJsonString.replace("\n"," ")+ "\n").getBytes("utf-8"))
IOUtils.copyBytes(inputStream, scopus_api_responseOut, 4096, false)
inputStream = new ByteArrayInputStream((titleJsonString.toString + "\n").getBytes("utf-8")) inputStream = new ByteArrayInputStream((titleJsonString.toString + "\n").getBytes("utf-8"))
@ -241,17 +262,21 @@ class ScopusSearchArticle extends ConfigurableStop {
} }
var value:String = null var arrayString = new util.ArrayList[String]()
def regula(reg:String,orgString:String) :String ={ def regula(reg:String,orgString:String):util.ArrayList[String] ={
arrayString.clear()
val pattern: Pattern = Pattern.compile(reg) val pattern: Pattern = Pattern.compile(reg)
val matcher: Matcher = pattern.matcher(orgString.toString) val matcher: Matcher = pattern.matcher(orgString.toString)
if (matcher.find()) value = matcher.group(1) while (matcher.find()) {
else value = "value not exists" arrayString.add(matcher.group(1))
return value
} }
return arrayString
}
var author = new StringBuilder
def regulaAuthor(reg:String,orgString:String) :StringBuilder ={ def regulaAuthor(reg:String,orgString:String) :StringBuilder ={
author.clear()
val pattern: Pattern = Pattern.compile(reg) val pattern: Pattern = Pattern.compile(reg)
val matcher: Matcher = pattern.matcher(orgString.toString) val matcher: Matcher = pattern.matcher(orgString.toString)
while (matcher.find()) author.append(matcher.group(2)+" "+matcher.group(1)+"#") while (matcher.find()) author.append(matcher.group(2)+" "+matcher.group(1)+"#")