去除xpath库,直接用jsoup解析xpath

This commit is contained in:
kunfei 2023-02-22 16:57:49 +08:00
parent 5216e36fbf
commit 39d6239b4b
5 changed files with 38 additions and 59 deletions

View File

@ -191,7 +191,6 @@ dependencies {
//
implementation('org.jsoup:jsoup:1.15.4')
implementation('com.jayway.jsonpath:json-path:2.7.0')
implementation('cn.wanghaomiao:JsoupXpath:2.5.2')
implementation(project(path: ':epublib'))
//JS rhino

View File

@ -2,7 +2,7 @@
{
"name": "百度汉语",
"urlRule": "https://dict.baidu.com/s?wd={{key}}",
"showRule": "@js:var jsoup = org.jsoup.Jsoup.parse(result)\njsoup.select(\"script\").remove()\njsoup.select(\"#word-header\").remove()\njsoup.select(\"#term-header\").remove()\njsoup.select(\".more-button\").remove()\njsoup.select(\".disactive\").remove()\njsoup.select(\"#download-wrapper\").remove()\njsoup.select(\"#right-panel\").remove()\njsoup.select(\"#content-panel\").html()",
"showRule": "@js:var jsoup = org.jsoup.Jsoup.parse(result)\njsoup.select(\"script\").remove()\njsoup.select(\"#word-header\").remove()\njsoup.select(\"#term-header\").remove()\njsoup.select(\".more-button\").remove()\njsoup.select(\".disactive\").remove()\njsoup.select(\"#download-wrapper\").remove()\njsoup.select(\"#upload-dialog\").remove()\njsoup.select(\"#right-panel\").remove()\njsoup.select(\"#content-panel\").html()",
"enabled": true,
"sortNumber": 0
},

View File

@ -6,7 +6,6 @@ import org.jsoup.nodes.Element
import org.jsoup.select.Collector
import org.jsoup.select.Elements
import org.jsoup.select.Evaluator
import org.seimicrawler.xpath.JXNode
/**
* Created by GKF on 2018/1/25.
@ -14,20 +13,16 @@ import org.seimicrawler.xpath.JXNode
*/
@Keep
class AnalyzeByJSoup(doc: Any) {
companion object {

    /**
     * Normalises an arbitrary input into a jsoup [Element].
     *
     * - An [Element] is returned unchanged.
     * - A [JXNode] that wraps an element is unwrapped via `asElement()`.
     * - Anything else (including a non-element [JXNode]) is converted with
     *   `toString()` and handed to [Jsoup.parse].
     *
     * @param doc the value to normalise
     * @return the resulting [Element]
     */
    fun parse(doc: Any): Element {
        if (doc is Element) return doc
        if (doc is JXNode && doc.isElement) return doc.asElement()
        return Jsoup.parse(doc.toString())
    }

}
private var element: Element = parse(doc)
/**
 * Normalises the constructor argument into a jsoup [Element].
 *
 * An [Element] is used as-is; any other value is converted with `toString()`
 * and handed to [Jsoup.parse].
 *
 * @param doc the value to normalise
 * @return the resulting [Element]
 */
private fun parse(doc: Any): Element =
    if (doc is Element) doc else Jsoup.parse(doc.toString())
/**
* 获取列表
*/
@ -470,7 +465,6 @@ class AnalyzeByJSoup(doc: Any) {
l = "" //清空
curMinus = false //重置
}
}
split = ' '

View File

@ -3,62 +3,34 @@ package io.legado.app.model.analyzeRule
import android.text.TextUtils
import androidx.annotation.Keep
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.parser.Parser
import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements
import org.seimicrawler.xpath.JXDocument
import org.seimicrawler.xpath.JXNode
@Keep
class AnalyzeByXPath(doc: Any) {
private var jxNode: Any = parse(doc)
private fun parse(doc: Any): Any {
private var element: Element = parse(doc)
private fun parse(doc: Any): Element {
return when (doc) {
is JXNode -> if (doc.isElement) doc else strToJXDocument(doc.toString())
is Document -> JXDocument.create(doc)
is Element -> JXDocument.create(Elements(doc))
is Elements -> JXDocument.create(doc)
else -> strToJXDocument(doc.toString())
is Element -> doc
else -> Jsoup.parse(doc.toString())
}
}
/**
 * Builds a [JXDocument] from raw markup.
 *
 * Steps, in order:
 * 1. Text ending in `</td>` is wrapped in `<tr>…</tr>`.
 * 2. Text (after step 1) ending in `</tr>` or `</tbody>` is wrapped in
 *    `<table>…</table>`.
 * 3. If the trimmed text starts with `<?xml` (case-insensitive) it is parsed
 *    with jsoup's XML parser; otherwise the string is passed straight to
 *    `JXDocument.create`.
 *
 * NOTE(review): the wrapping presumably keeps bare table rows/cells from being
 * discarded by the HTML parser — confirm against jsoup behaviour.
 *
 * @param html the markup to load
 * @return the document to run XPath queries against
 */
private fun strToJXDocument(html: String): JXDocument {
    // Stage 1: a dangling cell needs an enclosing row.
    val withRow = if (html.endsWith("</td>")) "<tr>${html}</tr>" else html
    // Stage 2: a dangling row/body needs an enclosing table.
    val needsTable = withRow.endsWith("</tr>") || withRow.endsWith("</tbody>")
    val markup = if (needsTable) "<table>${withRow}</table>" else withRow

    val looksLikeXml = markup.trim().startsWith("<?xml", true)
    return if (looksLikeXml) {
        JXDocument.create(Jsoup.parse(markup, Parser.xmlParser()))
    } else {
        JXDocument.create(markup)
    }
}
/**
 * Evaluates [xPath] against the stored `jxNode`.
 *
 * A [JXNode] is queried with `sel`; any other stored value is cast to
 * [JXDocument] and queried with `selN`.
 *
 * @param xPath the XPath expression to evaluate
 * @return the matched nodes, as returned by the underlying library call
 */
private fun getResult(xPath: String): List<JXNode>? =
    when (val root = jxNode) {
        is JXNode -> root.sel(xPath)
        else -> (root as JXDocument).selN(xPath)
    }
internal fun getElements(xPath: String): List<JXNode>? {
internal fun getElements(xPath: String): Elements? {
if (xPath.isEmpty()) return null
val jxNodes = ArrayList<JXNode>()
val jxNodes = Elements()
val ruleAnalyzes = RuleAnalyzer(xPath)
val rules = ruleAnalyzes.splitRule("&&", "||", "%%")
if (rules.size == 1) {
return getResult(rules[0])
return element.selectXpath(rules[0])
} else {
val results = ArrayList<List<JXNode>>()
val results = ArrayList<Elements>()
for (rl in rules) {
val temp = getElements(rl)
if (temp != null && temp.isNotEmpty()) {
@ -94,8 +66,8 @@ class AnalyzeByXPath(doc: Any) {
val rules = ruleAnalyzes.splitRule("&&", "||", "%%")
if (rules.size == 1) {
getResult(xPath)?.map {
result.add(it.asString())
element.selectXpath(xPath).forEach {
result.add(it.toString())
}
return result
} else {
@ -132,10 +104,22 @@ class AnalyzeByXPath(doc: Any) {
val ruleAnalyzes = RuleAnalyzer(rule)
val rules = ruleAnalyzes.splitRule("&&", "||")
if (rules.size == 1) {
getResult(rule)?.let {
return TextUtils.join("\n", it)
val xpath = when {
rule.startsWith("///") -> ".${rule.substring(1)}"
rule.startsWith("/") -> ".$rule"
else -> rule
}
val x = xpath.substringAfterLast("/")
return if (x.startsWith("@")) {
element.selectXpath(xpath.substringBeforeLast("/"))
.eachAttr(x.substring(1)).let {
TextUtils.join("\n", it)
}
} else {
element.selectXpath(xpath, TextNode::class.java).let {
TextUtils.join("\n", it)
}
}
return null
} else {
val textList = arrayListOf<String>()
for (rl in rules) {

View File

@ -458,7 +458,8 @@ class AnalyzeRule(
mode = Mode.Json
ruleStr
}
ruleStr.startsWith("/") -> {//XPath特征很明显,无需配置单独的识别标头
ruleStr.startsWith("/") || ruleStr.startsWith("./") -> {
//XPath特征很明显,无需配置单独的识别标头
mode = Mode.XPath
ruleStr
}
@ -603,6 +604,7 @@ class AnalyzeRule(
|| ruleStr.startsWith("$.")
|| ruleStr.startsWith("$[")
|| ruleStr.startsWith("//")
|| ruleStr.startsWith("./")
}
}