mirror of
https://github.com/gedoor/legado.git
synced 2024-08-30 09:23:26 +08:00
去除xpath库,直接用jsoup解析xpath
This commit is contained in:
parent
5216e36fbf
commit
39d6239b4b
@ -191,7 +191,6 @@ dependencies {
|
|||||||
//规则相关
|
//规则相关
|
||||||
implementation('org.jsoup:jsoup:1.15.4')
|
implementation('org.jsoup:jsoup:1.15.4')
|
||||||
implementation('com.jayway.jsonpath:json-path:2.7.0')
|
implementation('com.jayway.jsonpath:json-path:2.7.0')
|
||||||
implementation('cn.wanghaomiao:JsoupXpath:2.5.2')
|
|
||||||
implementation(project(path: ':epublib'))
|
implementation(project(path: ':epublib'))
|
||||||
|
|
||||||
//JS rhino
|
//JS rhino
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
{
|
{
|
||||||
"name": "百度汉语",
|
"name": "百度汉语",
|
||||||
"urlRule": "https://dict.baidu.com/s?wd={{key}}",
|
"urlRule": "https://dict.baidu.com/s?wd={{key}}",
|
||||||
"showRule": "@js:var jsoup = org.jsoup.Jsoup.parse(result)\njsoup.select(\"script\").remove()\njsoup.select(\"#word-header\").remove()\njsoup.select(\"#term-header\").remove()\njsoup.select(\".more-button\").remove()\njsoup.select(\".disactive\").remove()\njsoup.select(\"#download-wrapper\").remove()\njsoup.select(\"#right-panel\").remove()\njsoup.select(\"#content-panel\").html()",
|
"showRule": "@js:var jsoup = org.jsoup.Jsoup.parse(result)\njsoup.select(\"script\").remove()\njsoup.select(\"#word-header\").remove()\njsoup.select(\"#term-header\").remove()\njsoup.select(\".more-button\").remove()\njsoup.select(\".disactive\").remove()\njsoup.select(\"#download-wrapper\").remove()\njsoup.select(\"#upload-dialog\").remove()\njsoup.select(\"#right-panel\").remove()\njsoup.select(\"#content-panel\").html()",
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"sortNumber": 0
|
"sortNumber": 0
|
||||||
},
|
},
|
||||||
|
@ -6,7 +6,6 @@ import org.jsoup.nodes.Element
|
|||||||
import org.jsoup.select.Collector
|
import org.jsoup.select.Collector
|
||||||
import org.jsoup.select.Elements
|
import org.jsoup.select.Elements
|
||||||
import org.jsoup.select.Evaluator
|
import org.jsoup.select.Evaluator
|
||||||
import org.seimicrawler.xpath.JXNode
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by GKF on 2018/1/25.
|
* Created by GKF on 2018/1/25.
|
||||||
@ -14,20 +13,16 @@ import org.seimicrawler.xpath.JXNode
|
|||||||
*/
|
*/
|
||||||
@Keep
|
@Keep
|
||||||
class AnalyzeByJSoup(doc: Any) {
|
class AnalyzeByJSoup(doc: Any) {
|
||||||
companion object {
|
|
||||||
|
|
||||||
fun parse(doc: Any): Element {
|
|
||||||
return when (doc) {
|
|
||||||
is Element -> doc
|
|
||||||
is JXNode -> if (doc.isElement) doc.asElement() else Jsoup.parse(doc.toString())
|
|
||||||
else -> Jsoup.parse(doc.toString())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private var element: Element = parse(doc)
|
private var element: Element = parse(doc)
|
||||||
|
|
||||||
|
private fun parse(doc: Any): Element {
|
||||||
|
return when (doc) {
|
||||||
|
is Element -> doc
|
||||||
|
else -> Jsoup.parse(doc.toString())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取列表
|
* 获取列表
|
||||||
*/
|
*/
|
||||||
@ -470,7 +465,6 @@ class AnalyzeByJSoup(doc: Any) {
|
|||||||
l = "" //清空
|
l = "" //清空
|
||||||
curMinus = false //重置
|
curMinus = false //重置
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
split = ' '
|
split = ' '
|
||||||
|
@ -3,62 +3,34 @@ package io.legado.app.model.analyzeRule
|
|||||||
import android.text.TextUtils
|
import android.text.TextUtils
|
||||||
import androidx.annotation.Keep
|
import androidx.annotation.Keep
|
||||||
import org.jsoup.Jsoup
|
import org.jsoup.Jsoup
|
||||||
import org.jsoup.nodes.Document
|
|
||||||
import org.jsoup.nodes.Element
|
import org.jsoup.nodes.Element
|
||||||
import org.jsoup.parser.Parser
|
import org.jsoup.nodes.TextNode
|
||||||
import org.jsoup.select.Elements
|
import org.jsoup.select.Elements
|
||||||
import org.seimicrawler.xpath.JXDocument
|
|
||||||
import org.seimicrawler.xpath.JXNode
|
|
||||||
|
|
||||||
@Keep
|
@Keep
|
||||||
class AnalyzeByXPath(doc: Any) {
|
class AnalyzeByXPath(doc: Any) {
|
||||||
private var jxNode: Any = parse(doc)
|
|
||||||
|
|
||||||
private fun parse(doc: Any): Any {
|
private var element: Element = parse(doc)
|
||||||
|
|
||||||
|
private fun parse(doc: Any): Element {
|
||||||
return when (doc) {
|
return when (doc) {
|
||||||
is JXNode -> if (doc.isElement) doc else strToJXDocument(doc.toString())
|
is Element -> doc
|
||||||
is Document -> JXDocument.create(doc)
|
else -> Jsoup.parse(doc.toString())
|
||||||
is Element -> JXDocument.create(Elements(doc))
|
|
||||||
is Elements -> JXDocument.create(doc)
|
|
||||||
else -> strToJXDocument(doc.toString())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun strToJXDocument(html: String): JXDocument {
|
internal fun getElements(xPath: String): Elements? {
|
||||||
var html1 = html
|
|
||||||
if (html1.endsWith("</td>")) {
|
|
||||||
html1 = "<tr>${html1}</tr>"
|
|
||||||
}
|
|
||||||
if (html1.endsWith("</tr>") || html1.endsWith("</tbody>")) {
|
|
||||||
html1 = "<table>${html1}</table>"
|
|
||||||
}
|
|
||||||
if (html1.trim().startsWith("<?xml", true)) {
|
|
||||||
return JXDocument.create(Jsoup.parse(html1, Parser.xmlParser()))
|
|
||||||
}
|
|
||||||
return JXDocument.create(html1)
|
|
||||||
}
|
|
||||||
|
|
||||||
private fun getResult(xPath: String): List<JXNode>? {
|
|
||||||
val node = jxNode
|
|
||||||
return if (node is JXNode) {
|
|
||||||
node.sel(xPath)
|
|
||||||
} else {
|
|
||||||
(node as JXDocument).selN(xPath)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
internal fun getElements(xPath: String): List<JXNode>? {
|
|
||||||
|
|
||||||
if (xPath.isEmpty()) return null
|
if (xPath.isEmpty()) return null
|
||||||
|
|
||||||
val jxNodes = ArrayList<JXNode>()
|
val jxNodes = Elements()
|
||||||
val ruleAnalyzes = RuleAnalyzer(xPath)
|
val ruleAnalyzes = RuleAnalyzer(xPath)
|
||||||
val rules = ruleAnalyzes.splitRule("&&", "||", "%%")
|
val rules = ruleAnalyzes.splitRule("&&", "||", "%%")
|
||||||
|
|
||||||
if (rules.size == 1) {
|
if (rules.size == 1) {
|
||||||
return getResult(rules[0])
|
return element.selectXpath(rules[0])
|
||||||
} else {
|
} else {
|
||||||
val results = ArrayList<List<JXNode>>()
|
val results = ArrayList<Elements>()
|
||||||
for (rl in rules) {
|
for (rl in rules) {
|
||||||
val temp = getElements(rl)
|
val temp = getElements(rl)
|
||||||
if (temp != null && temp.isNotEmpty()) {
|
if (temp != null && temp.isNotEmpty()) {
|
||||||
@ -94,8 +66,8 @@ class AnalyzeByXPath(doc: Any) {
|
|||||||
val rules = ruleAnalyzes.splitRule("&&", "||", "%%")
|
val rules = ruleAnalyzes.splitRule("&&", "||", "%%")
|
||||||
|
|
||||||
if (rules.size == 1) {
|
if (rules.size == 1) {
|
||||||
getResult(xPath)?.map {
|
element.selectXpath(xPath).forEach {
|
||||||
result.add(it.asString())
|
result.add(it.toString())
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
} else {
|
} else {
|
||||||
@ -132,10 +104,22 @@ class AnalyzeByXPath(doc: Any) {
|
|||||||
val ruleAnalyzes = RuleAnalyzer(rule)
|
val ruleAnalyzes = RuleAnalyzer(rule)
|
||||||
val rules = ruleAnalyzes.splitRule("&&", "||")
|
val rules = ruleAnalyzes.splitRule("&&", "||")
|
||||||
if (rules.size == 1) {
|
if (rules.size == 1) {
|
||||||
getResult(rule)?.let {
|
val xpath = when {
|
||||||
return TextUtils.join("\n", it)
|
rule.startsWith("///") -> ".${rule.substring(1)}"
|
||||||
|
rule.startsWith("/") -> ".$rule"
|
||||||
|
else -> rule
|
||||||
|
}
|
||||||
|
val x = xpath.substringAfterLast("/")
|
||||||
|
return if (x.startsWith("@")) {
|
||||||
|
element.selectXpath(xpath.substringBeforeLast("/"))
|
||||||
|
.eachAttr(x.substring(1)).let {
|
||||||
|
TextUtils.join("\n", it)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
element.selectXpath(xpath, TextNode::class.java).let {
|
||||||
|
TextUtils.join("\n", it)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return null
|
|
||||||
} else {
|
} else {
|
||||||
val textList = arrayListOf<String>()
|
val textList = arrayListOf<String>()
|
||||||
for (rl in rules) {
|
for (rl in rules) {
|
||||||
|
@ -458,7 +458,8 @@ class AnalyzeRule(
|
|||||||
mode = Mode.Json
|
mode = Mode.Json
|
||||||
ruleStr
|
ruleStr
|
||||||
}
|
}
|
||||||
ruleStr.startsWith("/") -> {//XPath特征很明显,无需配置单独的识别标头
|
ruleStr.startsWith("/") || ruleStr.startsWith("./") -> {
|
||||||
|
//XPath特征很明显,无需配置单独的识别标头
|
||||||
mode = Mode.XPath
|
mode = Mode.XPath
|
||||||
ruleStr
|
ruleStr
|
||||||
}
|
}
|
||||||
@ -603,6 +604,7 @@ class AnalyzeRule(
|
|||||||
|| ruleStr.startsWith("$.")
|
|| ruleStr.startsWith("$.")
|
||||||
|| ruleStr.startsWith("$[")
|
|| ruleStr.startsWith("$[")
|
||||||
|| ruleStr.startsWith("//")
|
|| ruleStr.startsWith("//")
|
||||||
|
|| ruleStr.startsWith("./")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user