From 73a6b95037bfd91976c813c20c6e8cb7d7bbad30 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Tue, 25 Jul 2023 09:07:45 -0400 Subject: [PATCH] perf(css): improve link tree parsing nodes --- Cargo.lock | 255 +++++------------- spider/Cargo.toml | 4 +- .../packages/scraper/element_ref/element.rs | 11 +- spider/src/packages/scraper/selector.rs | 13 +- spider/src/utils.rs | 33 +-- spider/src/website.rs | 26 +- 6 files changed, 109 insertions(+), 233 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8eb046fe9..b0b2ec186 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -114,7 +114,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ "cfg-if", - "getrandom 0.2.9", + "getrandom", "once_cell", "version_check", ] @@ -156,7 +156,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c77a0045eda8b888c76ea473c2b0515ba6f471d318f8927c5c72240937035a6" dependencies = [ "android-properties", - "bitflags", + "bitflags 1.3.2", "cc", "jni-sys", "libc", @@ -456,6 +456,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" + [[package]] name = "block" version = "0.1.6" @@ -647,7 +653,7 @@ version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ - "bitflags", + "bitflags 1.3.2", "textwrap 0.11.0", "unicode-width", ] @@ -659,7 +665,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "atty", - "bitflags", + "bitflags 1.3.2", "clap_derive", "clap_lex", "indexmap", @@ -735,12 +741,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "convert_case" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" - [[package]] name = "convert_case" version = "0.5.0" @@ -769,7 +769,7 @@ version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2581bbab3b8ffc6fcbd550bf46c355135d16e9ff2a6ea032ad6b9bf1d7efe4fb" dependencies = [ - "bitflags", + "bitflags 1.3.2", "core-foundation", "core-graphics-types", "foreign-types", @@ -782,7 +782,7 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a68b68b3446082644c91ac778bf50cd4104bfb002b5a6a7c44cca5a2c70788b" dependencies = [ - "bitflags", + "bitflags 1.3.2", "core-foundation", "foreign-types", "libc", @@ -899,19 +899,15 @@ dependencies = [ [[package]] name = "cssparser" -version = "0.29.6" +version = "0.31.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" dependencies = [ "cssparser-macros", "dtoa-short", "itoa", - "matches", - "phf 0.10.1", - "proc-macro2", - "quote", + "phf 0.11.1", "smallvec", - "syn 1.0.109", ] [[package]] @@ -962,10 +958,8 @@ version = "0.99.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" dependencies = [ - "convert_case 0.4.0", "proc-macro2", "quote", - "rustc_version", "syn 1.0.109", ] @@ -1297,7 +1291,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "15d14128f06405808ce75bfebe11e9b0f9da18719ede6d7bdb1702d6bfe0f7e8" dependencies = [ - "bitflags", + "bitflags 1.3.2", "byteorder", "num_enum", "serde", @@ -1464,17 +1458,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "getrandom" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - [[package]] name = "getrandom" version = "0.2.9" @@ -1483,7 +1466,7 @@ checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", ] [[package]] @@ -1521,7 +1504,7 @@ version = "0.30.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62f9b771a65f0a1e3ddb6aa16f867d87dc73c922411c255e6c4ab7f6d45c7327" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cfg_aliases", "cgl", "core-foundation", @@ -1643,7 +1626,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3e372db8e5c0d213e0cd0b9be18be2aca3d44cf2fe30a9d46a65581cd454584" dependencies = [ "base64 0.13.1", - "bitflags", + "bitflags 1.3.2", "bytes", "headers-core", "http", @@ -1712,7 +1695,7 @@ version = "0.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b0e5190458ce24b51fd95e34be40d23b8ec15e1d6cef76a8f8f321744efefc8" dependencies = [ - "convert_case 0.5.0", + "convert_case", "lazy_static", ] @@ -2076,7 +2059,7 @@ checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", "log", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys 0.48.0", ] @@ -2128,7 +2111,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "451422b7e4718271c8b5b3aadf5adedba43dc76312454b387e98fae0fc951aa0" dependencies = [ - "bitflags", + "bitflags 1.3.2", "jni-sys", "ndk-sys", "num_enum", @@ -2163,7 +2146,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa52e972a9a719cecb6864fb88568781eb706bac2cd1d4f04a648542dbf78069" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cfg-if", "libc", "memoffset 0.6.5", @@ -2176,7 +2159,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4" dependencies = [ "autocfg", - "bitflags", + "bitflags 1.3.2", "cfg-if", "libc", "memoffset 0.6.5", @@ -2188,19 +2171,13 @@ version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cfg-if", "libc", "memoffset 0.7.1", "static_assertions", ] -[[package]] -name = "nodrop" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" - [[package]] name = "nohash-hasher" version = "0.2.0" @@ -2339,7 +2316,7 @@ version = "0.10.54" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69b3f656a17a6cbc115b5c7a40c616947d213ba182135b014d6051b73ab6f019" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cfg-if", "foreign-types", "libc", @@ -2452,24 +2429,13 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" -[[package]] -name = "phf" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" -dependencies = [ - "phf_shared 0.8.0", -] - [[package]] name = "phf" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" dependencies = [ - "phf_macros", "phf_shared 0.10.0", - "proc-macro-hack", ] [[package]] @@ -2478,17 +2444,18 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" dependencies = [ + "phf_macros", "phf_shared 0.11.1", ] [[package]] name = "phf_codegen" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" dependencies = [ - "phf_generator 0.8.0", - "phf_shared 0.8.0", + "phf_generator 0.10.0", + "phf_shared 0.10.0", ] [[package]] @@ -2501,16 +2468,6 @@ dependencies = [ "phf_shared 0.11.1", ] -[[package]] -name = "phf_generator" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" -dependencies = [ - "phf_shared 0.8.0", - "rand 0.7.3", -] - [[package]] name = "phf_generator" version = "0.10.0" @@ -2518,7 +2475,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" dependencies = [ "phf_shared 0.10.0", - "rand 0.8.5", + "rand", ] [[package]] @@ -2528,32 +2485,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf" dependencies = [ "phf_shared 0.11.1", - "rand 0.8.5", + "rand", ] [[package]] name = "phf_macros" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0" +checksum = "92aacdc5f16768709a569e913f7451034034178b05bdc8acda226659a3dccc66" dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", - "proc-macro-hack", + "phf_generator 0.11.1", + "phf_shared 0.11.1", "proc-macro2", "quote", "syn 1.0.109", ] -[[package]] -name = "phf_shared" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" -dependencies = [ - "siphasher", -] - [[package]] name = "phf_shared" version = "0.10.0" @@ -2644,7 +2591,7 @@ version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aaeebc51f9e7d2c150d3f3bfeb667f2aa985db5ef1e3d212847bdedb488beeaa" dependencies = [ - "bitflags", + "bitflags 1.3.2", "crc32fast", "fdeflate", "flate2", @@ -2658,7 +2605,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" dependencies = [ "autocfg", - "bitflags", + "bitflags 1.3.2", "cfg-if", "concurrent-queue", "libc", @@ -2713,12 +2660,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.20+deprecated" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" - [[package]] name = "proc-macro2" version = "1.0.63" @@ -2770,20 +2711,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha 0.2.2", - "rand_core 0.5.1", - "rand_hc", - "rand_pcg", -] - [[package]] name = "rand" version = "0.8.5" @@ -2791,18 +2718,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", + "rand_chacha", + "rand_core", ] [[package]] @@ -2812,16 +2729,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", + "rand_core", ] [[package]] @@ -2830,25 +2738,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.9", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rand_pcg" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" -dependencies = [ - "rand_core 0.5.1", + "getrandom", ] [[package]] @@ -2885,7 +2775,7 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -2894,7 +2784,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -3028,22 +2918,13 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" -[[package]] -name = "rustc_version" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" -dependencies = [ - "semver", -] - [[package]] name = "rustix" version = "0.37.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" dependencies = [ - "bitflags", + "bitflags 1.3.2", "errno", "io-lifetimes", "libc", @@ -3154,7 +3035,7 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" dependencies = [ - "bitflags", + "bitflags 1.3.2", "core-foundation", "core-foundation-sys", "libc", @@ -3173,28 +3054,23 @@ dependencies = [ [[package]] name = "selectors" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416" +checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" dependencies = [ - "bitflags", + "bitflags 2.3.3", "cssparser", "derive_more", "fxhash", "log", - "phf 0.8.0", - "phf_codegen 0.8.0", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen 0.10.0", "precomputed-hash", "servo_arc", "smallvec", ] -[[package]] -name = "semver" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" - [[package]] name = "serde" version = "1.0.167" @@ -3273,11 +3149,10 @@ dependencies = [ [[package]] name = "servo_arc" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" dependencies = [ - "nodrop", "stable_deref_trait", ] @@ -3353,7 +3228,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f307c47d32d2715eb2e0ece5589057820e0e5e70d07c247d1063e844e107f454" dependencies = [ - "bitflags", + "bitflags 1.3.2", "calloop", "dlib", "lazy_static", @@ -3436,7 +3311,7 @@ dependencies = [ name = "spider_examples" version = "1.34.2" dependencies = [ - "convert_case 0.5.0", + "convert_case", "env_logger 0.9.3", "flexbuffers", "htr", @@ -3916,7 +3791,7 @@ dependencies = [ "http", "httparse", "log", - "rand 0.8.5", + "rand", "sha1", "thiserror", "url", @@ -4168,12 +4043,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -4265,7 +4134,7 @@ version = "0.29.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f3b068c05a039c9f755f881dc50f01732214f5685e379829759088967c46715" dependencies = [ - "bitflags", + "bitflags 1.3.2", "downcast-rs", "libc", "nix 0.24.3", @@ -4304,7 +4173,7 @@ version = "0.29.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b950621f9354b322ee817a23474e479b34be96c2e909c14f7bc0100e9a970bc6" dependencies = [ - "bitflags", + "bitflags 1.3.2", "wayland-client", "wayland-commons", "wayland-scanner", @@ -4611,7 +4480,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "866db3f712fffba75d31bf0cdecf357c8aeafd158c5b7ab51dba2a2b2d47f196" dependencies = [ "android-activity", - "bitflags", + "bitflags 1.3.2", "cfg_aliases", "core-foundation", "core-graphics", @@ -4741,7 +4610,7 @@ dependencies = [ "nix 0.26.2", "once_cell", "ordered-stream", - "rand 0.8.5", + "rand", "serde", "serde-xml-rs", "serde_repr", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 0c044b167..c55814099 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -28,11 +28,11 @@ string_concat = "0.0.1" lazy_static = "1.4.0" compact_str = "0.7.0" fast_html5ever = "0.26.1" -selectors = "0.24.0" +selectors = "0.25.0" tendril = "0.4.3" ahash = "0.8.3" matches = "0.1.10" -cssparser = "0.29.6" +cssparser = "0.31.2" smallvec = "1.10.0" ego-tree = "0.6.2" num_cpus = "1.15.0" diff --git a/spider/src/packages/scraper/element_ref/element.rs b/spider/src/packages/scraper/element_ref/element.rs index ae141e07e..77bb3a0a9 100644 --- a/spider/src/packages/scraper/element_ref/element.rs +++ b/spider/src/packages/scraper/element_ref/element.rs @@ -26,6 +26,14 @@ impl<'a> Element for ElementRef<'a> { None } + fn first_element_child(&self) -> Option { + self.prev_siblings().nth(0).and_then(ElementRef::wrap) + } + + fn apply_selector_flags(&self, _: selectors::matching::ElementSelectorFlags) { + // Apply selector flags when enabled + } + fn is_pseudo_element(&self) -> bool { false } @@ -80,11 +88,10 @@ impl<'a> Element for ElementRef<'a> { }) } - fn match_non_ts_pseudo_class( + fn match_non_ts_pseudo_class( &self, _pc: &NonTSPseudoClass, _context: &mut matching::MatchingContext, - _flags_setter: &mut F, ) -> bool { false } diff --git a/spider/src/packages/scraper/selector.rs b/spider/src/packages/scraper/selector.rs index e5efbac63..3903bc0e2 100644 --- a/spider/src/packages/scraper/selector.rs +++ b/spider/src/packages/scraper/selector.rs @@ -5,7 +5,7 @@ use crate::packages::scraper::error::SelectorErrorKind; use cssparser::ToCss; use fast_html5ever::{LocalName, Namespace}; use selectors::parser::SelectorParseErrorKind; -use selectors::{matching, parser}; +use selectors::{matching, parser, NthIndexCache}; use smallvec::SmallVec; use std::convert::TryFrom; use std::fmt; @@ -26,7 +26,7 @@ impl Selector { let mut parser_input = cssparser::ParserInput::new(selectors); let mut parser = cssparser::Parser::new(&mut parser_input); - parser::SelectorList::parse(&Parser, &mut parser) + parser::SelectorList::parse(&Parser, &mut parser, parser::ParseRelative::No) .map(|list| Selector { selectors: list.0 }) .map_err(SelectorErrorKind::from) } @@ -40,16 +40,19 @@ impl Selector { /// The optional `scope` argument is used to specify which element has `:scope` pseudo-class. /// When it is `None`, `:scope` will match the root element. pub fn matches_with_scope(&self, element: &ElementRef, scope: Option) -> bool { + let mut binding = NthIndexCache::default(); let mut context = matching::MatchingContext::new( matching::MatchingMode::Normal, None, - None, + &mut binding, matching::QuirksMode::NoQuirks, + matching::NeedsSelectorFlags::No, + matching::IgnoreNthChildForInvalidation::No ); context.scope_element = scope.map(|x| selectors::Element::opaque(&x)); self.selectors .iter() - .any(|s| matching::matches_selector(s, 0, None, element, &mut context, &mut |_, _| {})) + .any(|s| matching::matches_selector(s, 0, None, element, &mut context)) } } @@ -77,7 +80,7 @@ impl parser::SelectorImpl for Simple { type PseudoElement = PseudoElement; // see: https://github.com/servo/servo/pull/19747#issuecomment-357106065 - type ExtraMatchingData = String; + type ExtraMatchingData<'a> = std::marker::PhantomData<&'a ()>; } /// Wraps [`String`] so that it can be used with [`selectors`] diff --git a/spider/src/utils.rs b/spider/src/utils.rs index 9e23eefca..7e9ba731d 100644 --- a/spider/src/utils.rs +++ b/spider/src/utils.rs @@ -193,31 +193,34 @@ lazy_static! { pub async fn pause(domain: &str) { let s = CONTROLLER.clone(); - s.lock() - .await - .0 - .send((domain.to_string(), Handler::Pause)) - .unwrap(); + match s.lock().await.0.send((domain.into(), Handler::Pause)) { + _ => (), + }; } /// resume a target website crawl pub async fn resume(domain: &str) { let s = CONTROLLER.clone(); - s.lock() - .await - .0 - .send((domain.to_string(), Handler::Resume)) - .unwrap(); + match s.lock().await.0.send((domain.into(), Handler::Resume)) { + _ => (), + }; } /// shutdown a target website crawl pub async fn shutdown(domain: &str) { let s = CONTROLLER.clone(); - s.lock() - .await - .0 - .send((domain.to_string(), Handler::Shutdown)) - .unwrap(); + match s.lock().await.0.send((domain.into(), Handler::Shutdown)) { + _ => (), + }; +} + +/// reset a target website crawl +pub async fn reset(domain: &str) { + let s = CONTROLLER.clone(); + + match s.lock().await.0.send((domain.into(), Handler::Start)) { + _ => (), + }; } diff --git a/spider/src/website.rs b/spider/src/website.rs index 278e7fab0..7628c2941 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -433,15 +433,9 @@ impl Website { while l.changed().await.is_ok() { let n = &*l.borrow(); - let (name, rest) = n; + let (target, rest) = n; - let url = if name.ends_with('/') { - name.into() - } else { - string_concat!(name.clone(), "/") - }; - - if domain.eq_ignore_ascii_case(&url) { + if domain.eq_ignore_ascii_case(&target) { if rest == &Handler::Resume { paused.store(0, Ordering::Relaxed); } @@ -1163,7 +1157,7 @@ async fn crawl_invalid() { let mut website: Website = Website::new(domain); website.crawl().await; let mut uniq: Box> = Box::new(HashSet::new()); - uniq.insert(format!("{}/", domain.to_string()).into()); // TODO: remove trailing slash mutate + uniq.insert(format!("{}", domain.to_string()).into()); // TODO: remove trailing slash mutate assert_eq!(website.links_visited, uniq); // only the target url should exist } @@ -1401,16 +1395,16 @@ async fn test_link_duplicates() { async fn test_crawl_pause_resume() { use crate::utils::{pause, resume}; - let url = "https://choosealicense.com/"; - let mut website: Website = Website::new(&url); + let domain = "https://choosealicense.com/"; + let mut website: Website = Website::new(&domain); let start = tokio::time::Instant::now(); tokio::spawn(async move { - pause(url).await; + pause(domain).await; // static website test pause/resume - scan will never take longer than 5secs for target website choosealicense tokio::time::sleep(Duration::from_millis(5000)).await; - resume(url).await; + resume(domain).await; }); website.crawl().await; @@ -1435,11 +1429,11 @@ async fn test_crawl_shutdown() { use crate::utils::shutdown; // use target blog to prevent shutdown of prior crawler - let url = "https://rsseau.fr/"; - let mut website: Website = Website::new(&url); + let domain = "https://rsseau.fr/"; + let mut website: Website = Website::new(&domain); tokio::spawn(async move { - shutdown(url).await; + shutdown(domain).await; }); website.crawl().await;