Fix a bug that occurred when a title request returned multiple results

This commit is contained in:
or 2019-08-05 16:29:26 +08:00
parent f57d03d58a
commit 8ec4bf848f
1 changed file with 47 additions and 22 deletions

View File

@ -2,6 +2,7 @@ package cn.piflow.bundle.nsfc.externalAcquisition
import java.io.{ByteArrayInputStream, IOException}
import java.net.URLEncoder
import java.util
import java.util.regex.{Matcher, Pattern}
import cn.piflow.conf.bean.PropertyDescriptor
@ -27,7 +28,7 @@ class ScopusSearchArticle extends ConfigurableStop {
override val outportList: List[String] = List(PortEnum.scopus_articlePort.toString,PortEnum.scopus_article_api_response.toString)
var author = new StringBuilder
var intervalTime:String = _
var apiKey :String= _
var httpurl :String= _
@ -66,11 +67,22 @@ class ScopusSearchArticle extends ConfigurableStop {
var eid:String =null
var prismUrls: util.ArrayList[String]= new util.ArrayList[String]()
var scopusIds: util.ArrayList[String]= new util.ArrayList[String]()
var num = 1
try {
inDf.collect().foreach(x=>{
title = URLEncoder.encode(x.get(0).asInstanceOf[String],"UTF-8")
titleUrl = httpurl+s"(${title})&apiKey=${apiKey}&httpAccept=application%2Fjson"
println(titleUrl)
titleJsonString = getHttp(titleUrl)
// Filter valid request scopus files
@ -79,32 +91,41 @@ class ScopusSearchArticle extends ConfigurableStop {
titleJsonString = titleJsonString.replace("\"dc:identifier\":\"SCOPUS_ID:","\"scopus_id\":\"")
reg = "scopus_id\":\"(.*?)\","
scopus_id = regula(reg,titleJsonString)
println(scopus_id)
scopusIds = regula(reg,titleJsonString)
// Request author information
if (titleJsonString.contains("prism:url")){
if (titleJsonString.contains("prism:url")) {
reg = "prism:url\":\"(.*?)\","
prismUrl= regula(reg,titleJsonString)+s"?field=author&apikey=${apiKey}&httpAccept=application%2Fjson"
prismUrls = regula(reg, titleJsonString)
for (x <- 0 until prismUrls.size) {
println(num+"----------------------"+scopusIds.get(x))
num+=1
prismUrl = prismUrls.get(x) + s"?field=author&apikey=${apiKey}&httpAccept=application%2Fjson"
authorJsonString = getHttp(prismUrl)
reg = "preferred-name\": \\{\n.*?\"ce:given-name\": \"(.*?)\",\n.*?,\n.*?ce:surname\": \"(.*?)\","
authorString = regulaAuthor(reg,authorJsonString).stripSuffix("#")
authorString = regulaAuthor(reg, authorJsonString).stripSuffix("#")
titleJsonString = titleJsonString.replace("\"prism:url\":\"" + prismUrls.get(x), "\"authorString\":\"" + authorString + "\",\"prism:url\":\"" + prismUrls.get(x))
inputStream = new ByteArrayInputStream((scopusIds.get(x) + "##&##" + titleJsonString + "##&##" + authorJsonString.replace("\n", " ") + "\n").getBytes("utf-8"))
IOUtils.copyBytes(inputStream, scopus_api_responseOut, 4096, false)
titleJsonString=titleJsonString.replace("\"prism:url\":","\"authorString\":\""+authorString+"\",\"prism:url\":")
}
}
// Request summary information
if (titleJsonString.contains("eid")){
reg = "eid\":\"(.*?)\","
eid = regula(reg,titleJsonString)
}
// if (titleJsonString.contains("eid")) reg = "eid\":\"(.*?)\","
println(scopus_id)
inputStream = new ByteArrayInputStream((scopus_id+"##&##"+titleJsonString+"##&##"+authorJsonString.replace("\n"," ")+ "\n").getBytes("utf-8"))
IOUtils.copyBytes(inputStream, scopus_api_responseOut, 4096, false)
inputStream = new ByteArrayInputStream((titleJsonString.toString + "\n").getBytes("utf-8"))
@ -241,17 +262,21 @@ class ScopusSearchArticle extends ConfigurableStop {
}
var value:String = null
def regula(reg:String,orgString:String) :String ={
var arrayString = new util.ArrayList[String]()
def regula(reg:String,orgString:String):util.ArrayList[String] ={
arrayString.clear()
val pattern: Pattern = Pattern.compile(reg)
val matcher: Matcher = pattern.matcher(orgString.toString)
if (matcher.find()) value = matcher.group(1)
else value = "value not exists"
return value
while (matcher.find()) {
arrayString.add(matcher.group(1))
}
return arrayString
}
var author = new StringBuilder
def regulaAuthor(reg:String,orgString:String) :StringBuilder ={
author.clear()
val pattern: Pattern = Pattern.compile(reg)
val matcher: Matcher = pattern.matcher(orgString.toString)
while (matcher.find()) author.append(matcher.group(2)+" "+matcher.group(1)+"#")