From ed78b253ffb3173c2efd5b5444b06e7ee9ac0a4d Mon Sep 17 00:00:00 2001 From: Paul Makles Date: Tue, 1 Oct 2024 21:11:08 +0100 Subject: [PATCH] feat(services/january): website embed generation --- Cargo.lock | 347 ++++++++++++++++++- crates/core/models/src/v0/embeds.rs | 35 +- crates/services/january/Cargo.toml | 6 + crates/services/january/src/api.rs | 10 +- crates/services/january/src/main.rs | 1 + crates/services/january/src/requests.rs | 188 +++++++--- crates/services/january/src/website_embed.rs | 304 ++++++++++++++++ 7 files changed, 814 insertions(+), 77 deletions(-) create mode 100644 crates/services/january/src/website_embed.rs diff --git a/Cargo.lock b/Cargo.lock index 742393001..63f918891 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,6 +111,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", + "getrandom", "once_cell", "version_check", "zerocopy", @@ -324,9 +325,9 @@ dependencies = [ [[package]] name = "async-recursion" -version = "1.0.4" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote 1.0.37", @@ -1646,6 +1647,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "cssparser" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.11.2", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote 1.0.37", + "syn 2.0.76", +] + [[package]] name = "ct-codecs" version = "1.1.1" @@ -1904,6 +1928,17 @@ dependencies = [ "syn 1.0.107", ] +[[package]] +name = "derive_more" +version = "0.99.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" +dependencies = [ + "proc-macro2", + "quote 1.0.37", + "syn 2.0.76", +] + [[package]] name = "devise" version = "0.3.1" @@ -1970,6 +2005,21 @@ version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" +[[package]] +name = "dtoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "dyn-clone" version = "1.0.5" @@ -2016,6 +2066,12 @@ dependencies = [ "getrandom", ] +[[package]] +name = "ego-tree" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" + [[package]] name = "either" version = "1.6.1" @@ -2055,9 +2111,9 @@ dependencies = [ [[package]] name = "encoding_rs" -version = "0.8.31" +version = "0.8.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" dependencies = [ "cfg-if", ] @@ -2355,6 +2411,16 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.21" @@ -2476,6 +2542,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generator" version = "0.7.0" @@ -2508,6 +2583,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.14" @@ -2811,6 +2895,20 @@ dependencies = [ "winapi", ] +[[package]] +name = "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote 1.0.37", + "syn 2.0.76", +] + [[package]] name = "http" version = "0.2.12" @@ -3735,12 +3833,32 @@ version = "0.11.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9106e1d747ffd48e6be5bb2d97fa706ed25b144fbee4d5c02eae110cd8d6badd" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "maplit" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf 0.11.2", + "phf_codegen 0.11.2", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -3789,9 +3907,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.5.0" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "metrics" @@ -4548,6 +4666,96 @@ dependencies = [ "sha-1 0.8.2", ] +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_macros", + "phf_shared 0.11.2", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator 0.11.2", + "phf_shared 0.11.2", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.5", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared 0.11.2", + "rand 0.8.5", +] + +[[package]] +name = "phf_macros" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" +dependencies = [ + "phf_generator 0.11.2", + "phf_shared 0.11.2", + "proc-macro2", + "quote 1.0.37", + "syn 2.0.76", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.0.10" @@ -4670,6 +4878,12 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "pretty_env_logger" version = "0.4.0" @@ -5159,14 +5373,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.4" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12de2eff854e5fa4b1295edd650e227e9d8fb0c9e90b12e7f36d6a6811791a29" +checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.3.7", - "regex-syntax 0.7.5", + "regex-automata 0.4.8", + "regex-syntax 0.8.5", ] [[package]] @@ -5180,13 +5394,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49530408a136e16e5b486e883fbb6ba058e8e4e8ae6621a77b048b314336e629" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.5", + "regex-syntax 0.8.5", ] [[package]] @@ -5203,9 +5417,9 @@ checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" [[package]] name = "regex-syntax" -version = "0.7.5" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "reqwest" @@ -5498,16 +5712,20 @@ dependencies = [ name = "revolt-january" version = "0.7.14" dependencies = [ + "async-recursion", "axum", "axum-extra", + "encoding_rs", "lazy_static", "mime", "moka", + "regex", "reqwest 0.12.4", "revolt-config", "revolt-files", "revolt-models", "revolt-result", + "scraper", "serde", "serde_json", "tempfile", @@ -6142,6 +6360,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scraper" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b90460b31bfe1fc07be8262e42c665ad97118d4585869de9345a84d501a9eaf0" +dependencies = [ + "ahash 0.8.11", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "once_cell", + "selectors", + "tendril", +] + [[package]] name = "sct" version = "0.7.0" @@ -6206,6 +6440,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +dependencies = [ + "bitflags 2.6.0", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen 0.10.0", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "semver" version = "0.9.0" @@ -6446,6 +6699,15 @@ dependencies = [ "syn 1.0.107", ] +[[package]] +name = "servo_arc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sha-1" version = "0.8.2" @@ -6562,6 +6824,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "slab" version = "0.4.6" @@ -6639,6 +6907,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "state" version = "0.5.3" @@ -6648,6 +6922,32 @@ dependencies = [ "loom", ] +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared 0.10.0", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", + "proc-macro2", + "quote 1.0.37", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -6845,6 +7145,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "termcolor" version = "1.1.3" @@ -7437,6 +7748,12 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + [[package]] name = "unicode-xid" version = "0.0.4" diff --git a/crates/core/models/src/v0/embeds.rs b/crates/core/models/src/v0/embeds.rs index 6649581a0..4ea4d5cfe 100644 --- a/crates/core/models/src/v0/embeds.rs +++ b/crates/core/models/src/v0/embeds.rs @@ -119,8 +119,6 @@ auto_derived!( #[serde(skip_serializing_if = "Option::is_none")] pub video: Option