去除xpath库,直接用jsoup解析xpath

This commit is contained in:
kunfei 2023-02-22 16:57:49 +08:00
parent 5216e36fbf
commit 39d6239b4b
5 changed files with 38 additions and 59 deletions

View File

@@ -191,7 +191,6 @@ dependencies {
// //
implementation('org.jsoup:jsoup:1.15.4') implementation('org.jsoup:jsoup:1.15.4')
implementation('com.jayway.jsonpath:json-path:2.7.0') implementation('com.jayway.jsonpath:json-path:2.7.0')
implementation('cn.wanghaomiao:JsoupXpath:2.5.2')
implementation(project(path: ':epublib')) implementation(project(path: ':epublib'))
//JS rhino //JS rhino

View File

@@ -2,7 +2,7 @@
{ {
"name": "百度汉语", "name": "百度汉语",
"urlRule": "https://dict.baidu.com/s?wd={{key}}", "urlRule": "https://dict.baidu.com/s?wd={{key}}",
"showRule": "@js:var jsoup = org.jsoup.Jsoup.parse(result)\njsoup.select(\"script\").remove()\njsoup.select(\"#word-header\").remove()\njsoup.select(\"#term-header\").remove()\njsoup.select(\".more-button\").remove()\njsoup.select(\".disactive\").remove()\njsoup.select(\"#download-wrapper\").remove()\njsoup.select(\"#right-panel\").remove()\njsoup.select(\"#content-panel\").html()", "showRule": "@js:var jsoup = org.jsoup.Jsoup.parse(result)\njsoup.select(\"script\").remove()\njsoup.select(\"#word-header\").remove()\njsoup.select(\"#term-header\").remove()\njsoup.select(\".more-button\").remove()\njsoup.select(\".disactive\").remove()\njsoup.select(\"#download-wrapper\").remove()\njsoup.select(\"#upload-dialog\").remove()\njsoup.select(\"#right-panel\").remove()\njsoup.select(\"#content-panel\").html()",
"enabled": true, "enabled": true,
"sortNumber": 0 "sortNumber": 0
}, },

View File

@@ -6,7 +6,6 @@ import org.jsoup.nodes.Element
import org.jsoup.select.Collector import org.jsoup.select.Collector
import org.jsoup.select.Elements import org.jsoup.select.Elements
import org.jsoup.select.Evaluator import org.jsoup.select.Evaluator
import org.seimicrawler.xpath.JXNode
/** /**
* Created by GKF on 2018/1/25. * Created by GKF on 2018/1/25.
@@ -14,20 +13,16 @@ import org.seimicrawler.xpath.JXNode
*/ */
@Keep @Keep
class AnalyzeByJSoup(doc: Any) { class AnalyzeByJSoup(doc: Any) {
companion object {
fun parse(doc: Any): Element {
return when (doc) {
is Element -> doc
is JXNode -> if (doc.isElement) doc.asElement() else Jsoup.parse(doc.toString())
else -> Jsoup.parse(doc.toString())
}
}
}
private var element: Element = parse(doc) private var element: Element = parse(doc)
private fun parse(doc: Any): Element {
return when (doc) {
is Element -> doc
else -> Jsoup.parse(doc.toString())
}
}
/** /**
* 获取列表 * 获取列表
*/ */
@@ -470,7 +465,6 @@ class AnalyzeByJSoup(doc: Any) {
l = "" //清空 l = "" //清空
curMinus = false //重置 curMinus = false //重置
} }
} }
split = ' ' split = ' '

View File

@@ -3,62 +3,34 @@ package io.legado.app.model.analyzeRule
import android.text.TextUtils import android.text.TextUtils
import androidx.annotation.Keep import androidx.annotation.Keep
import org.jsoup.Jsoup import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.parser.Parser import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements import org.jsoup.select.Elements
import org.seimicrawler.xpath.JXDocument
import org.seimicrawler.xpath.JXNode
@Keep @Keep
class AnalyzeByXPath(doc: Any) { class AnalyzeByXPath(doc: Any) {
private var jxNode: Any = parse(doc)
private fun parse(doc: Any): Any { private var element: Element = parse(doc)
private fun parse(doc: Any): Element {
return when (doc) { return when (doc) {
is JXNode -> if (doc.isElement) doc else strToJXDocument(doc.toString()) is Element -> doc
is Document -> JXDocument.create(doc) else -> Jsoup.parse(doc.toString())
is Element -> JXDocument.create(Elements(doc))
is Elements -> JXDocument.create(doc)
else -> strToJXDocument(doc.toString())
} }
} }
private fun strToJXDocument(html: String): JXDocument { internal fun getElements(xPath: String): Elements? {
var html1 = html
if (html1.endsWith("</td>")) {
html1 = "<tr>${html1}</tr>"
}
if (html1.endsWith("</tr>") || html1.endsWith("</tbody>")) {
html1 = "<table>${html1}</table>"
}
if (html1.trim().startsWith("<?xml", true)) {
return JXDocument.create(Jsoup.parse(html1, Parser.xmlParser()))
}
return JXDocument.create(html1)
}
private fun getResult(xPath: String): List<JXNode>? {
val node = jxNode
return if (node is JXNode) {
node.sel(xPath)
} else {
(node as JXDocument).selN(xPath)
}
}
internal fun getElements(xPath: String): List<JXNode>? {
if (xPath.isEmpty()) return null if (xPath.isEmpty()) return null
val jxNodes = ArrayList<JXNode>() val jxNodes = Elements()
val ruleAnalyzes = RuleAnalyzer(xPath) val ruleAnalyzes = RuleAnalyzer(xPath)
val rules = ruleAnalyzes.splitRule("&&", "||", "%%") val rules = ruleAnalyzes.splitRule("&&", "||", "%%")
if (rules.size == 1) { if (rules.size == 1) {
return getResult(rules[0]) return element.selectXpath(rules[0])
} else { } else {
val results = ArrayList<List<JXNode>>() val results = ArrayList<Elements>()
for (rl in rules) { for (rl in rules) {
val temp = getElements(rl) val temp = getElements(rl)
if (temp != null && temp.isNotEmpty()) { if (temp != null && temp.isNotEmpty()) {
@@ -94,8 +66,8 @@ class AnalyzeByXPath(doc: Any) {
val rules = ruleAnalyzes.splitRule("&&", "||", "%%") val rules = ruleAnalyzes.splitRule("&&", "||", "%%")
if (rules.size == 1) { if (rules.size == 1) {
getResult(xPath)?.map { element.selectXpath(xPath).forEach {
result.add(it.asString()) result.add(it.toString())
} }
return result return result
} else { } else {
@@ -132,10 +104,22 @@ class AnalyzeByXPath(doc: Any) {
val ruleAnalyzes = RuleAnalyzer(rule) val ruleAnalyzes = RuleAnalyzer(rule)
val rules = ruleAnalyzes.splitRule("&&", "||") val rules = ruleAnalyzes.splitRule("&&", "||")
if (rules.size == 1) { if (rules.size == 1) {
getResult(rule)?.let { val xpath = when {
return TextUtils.join("\n", it) rule.startsWith("///") -> ".${rule.substring(1)}"
rule.startsWith("/") -> ".$rule"
else -> rule
}
val x = xpath.substringAfterLast("/")
return if (x.startsWith("@")) {
element.selectXpath(xpath.substringBeforeLast("/"))
.eachAttr(x.substring(1)).let {
TextUtils.join("\n", it)
}
} else {
element.selectXpath(xpath, TextNode::class.java).let {
TextUtils.join("\n", it)
}
} }
return null
} else { } else {
val textList = arrayListOf<String>() val textList = arrayListOf<String>()
for (rl in rules) { for (rl in rules) {

View File

@@ -458,7 +458,8 @@ class AnalyzeRule(
mode = Mode.Json mode = Mode.Json
ruleStr ruleStr
} }
ruleStr.startsWith("/") -> {//XPath特征很明显,无需配置单独的识别标头 ruleStr.startsWith("/") || ruleStr.startsWith("./") -> {
//XPath特征很明显,无需配置单独的识别标头
mode = Mode.XPath mode = Mode.XPath
ruleStr ruleStr
} }
@@ -603,6 +604,7 @@ class AnalyzeRule(
|| ruleStr.startsWith("$.") || ruleStr.startsWith("$.")
|| ruleStr.startsWith("$[") || ruleStr.startsWith("$[")
|| ruleStr.startsWith("//") || ruleStr.startsWith("//")
|| ruleStr.startsWith("./")
} }
} }