Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/index reusable content #69

Merged
merged 25 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ad7c03c
fix(Lucene.Core): upgrade to new version of Kentico, webpage events w…
bkapustik Aug 15, 2024
e7a0a0c
feat(Lucene.Core): Mapping reusable items
bkapustik Aug 16, 2024
ff88d5f
feat(docs): add reusable content types
bkapustik Aug 18, 2024
f90559b
fix(Lucene.Admin): wrong placeholder in content type selection
bkapustik Aug 18, 2024
729ec2c
fix(Lucene.Admin): reusable content provider comment
bkapustik Aug 18, 2024
8c9390c
feat(docs): reusable content types images
bkapustik Aug 18, 2024
483b408
feat(Lucene.Core): indexing reusable content example strategy
bkapustik Aug 19, 2024
9f72ae1
feat(docs): indexing reusable content
bkapustik Aug 19, 2024
c1b46e4
fix(docs): remove invalid images
bkapustik Aug 19, 2024
6843ac2
Add files via upload
bkapustik Aug 19, 2024
de73fc8
mrak .jpg as binary in .gitattributes
Lahvac Aug 19, 2024
f57d265
feat(Lucene.Core): indexing reusable content example
bkapustik Aug 19, 2024
f91229a
Merge branch 'feat/index-reusable-content' of https://github.com/Kent…
bkapustik Aug 19, 2024
3a48bb5
feat(Lucene.Core): indexing reusable content do not execute rebuild q…
bkapustik Aug 19, 2024
9174ab5
feat(Lucene.Admin): update packages
bkapustik Aug 19, 2024
88701c6
Merge master
bkapustik Aug 20, 2024
e465264
Merge branch 'main' into feat/index-reusable-content
bkapustik Aug 20, 2024
685742f
fix(docs): typo
bkapustik Aug 26, 2024
6b125a4
fix(DancingGoat): refactor string initialization with string.empty
bkapustik Aug 26, 2024
d773ec5
refactor(Lucene.Core, Admin): refactor string assignments
bkapustik Aug 26, 2024
cd8c14c
refactor(Lucene.Core): remove specific info providers
bkapustik Aug 26, 2024
9e16fb3
refactor(Lucene.Core): use iinfoproviders
bkapustik Aug 26, 2024
7a2554c
refactor(docs): change "" to string.Empty
bkapustik Aug 26, 2024
055dcce
refactor(Lucene.Core): remove redundand info provider definitions
bkapustik Aug 26, 2024
dd9edb3
refactor(DancingGoat): conditional blocks
bkapustik Aug 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
* text eol=crlf
*.png binary
*.png binary
*.jpg binary
13 changes: 7 additions & 6 deletions docs/Custom-index-strategy.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy
{
var document = new Document();

string sortableTitle = "";
string title = "";
string sortableTitle = string.Empty;
string title = string.Empty;

// IIndexEventItemModel could be a reusable content item or a web page item, so we use
// pattern matching to get access to the web page item specific type and fields
Expand Down Expand Up @@ -60,7 +60,7 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy

var article = page.ArticlePageArticle.FirstOrDefault();

sortableTitle = title = article?.ArticleTitle ?? "";
sortableTitle = title = article?.ArticleTitle ?? string.Empty;
}

document.Add(new TextField(nameof(GlobalSearchResultModel.Title), title, Field.Store.YES));
Expand All @@ -83,6 +83,7 @@ public static class BaseDocumentProperties
public const string ID = "ID";
public const string CONTENT_TYPE_NAME = "ContentTypeName";
// ...
// This field is only added to the document if the indexed item is a web page.
public const string URL = "Url";
}
```
Expand Down Expand Up @@ -135,9 +136,9 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy
{
var document = new Document();

string sortableTitle = "";
string title = "";
string contentType = "";
string sortableTitle = string.Empty;
string title = string.Empty;
string contentType = string.Empty;

if (item is IndexEventWebPageItemModel webpageItem &&
string.Equals(indexedModel.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
Expand Down
15 changes: 9 additions & 6 deletions docs/Managing-Indexes.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,22 @@ Create a new index or select an index to edit by clicking the index row or the

Fill out the search index form, populating the fields with your custom values.

![Administration search index list](/images/xperience-administration-search-index-edit-form.jpg)
![Administration search index edit form](/images/xperience-administration-search-index-edit-form.jpg)

- Rebuild Hook - for validating a request rebuild of the search index from an external source (ex: API request)
- Indexed Languages - the index will only include content in the selected languages
- Channel Name - the index will only be triggered by web page item creation or modication in the selected website channel
- Index Name - the name of the displayed index.
- Included Reusable Content Types - these are the reusable content types that will be processed by your custom indexing strategy.
If no option is selected, no items will be processed.
- Indexed Languages - the index will only include content in the selected languages.
- Channel Name - the index will only be triggered by web page item creation or modification in the selected website channel.
- Indexing Strategy - the indexing strategy specified in code during dependency registration of custom indexing strategies.
- If you want the default strategy to appear here, register it explicitly in `IServiceCollection.AddKenticoLucene()` method
- If you want the default strategy to appear here, register it explicitly in `IServiceCollection.AddKenticoLucene()` method.
- Lucene Analyzer - the Lucene analyzer which indexes use to analyze text.
- Rebuild Hook - for validating a request to rebuild the search index that comes from an external source (ex: API request).

Now, configure the web page paths and content types that the search index depends on by clicking the Add New Path button
or clicking an existing path in the table at the top of the index configuration form.

![Administration search index list](/images/xperience-administration-search-index-edit-form-paths-edit.jpg)
![Administration search index edit paths form](/images/xperience-administration-search-index-edit-form-paths-edit.jpg)

- Included Path - can be an exact relative path of a web page item, (ex: `/path/to/my/page`), or a wildcard path (ex: `/parent-path/%`)
- To determine a web page path, select the web page in the website channel page tree, then view the "Current URL" in the Content tab of the web page. The path will be the relative path excluding the domain
Expand Down
10 changes: 5 additions & 5 deletions docs/Scraping-web-page-content.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public class WebCrawlerService
ex,
$"Tree Path: {page.SystemFields.WebPageItemTreePath}");
}
return "";
return string.Empty;
}

public async Task<string> CrawlPage(string url)
Expand All @@ -75,7 +75,7 @@ public class WebCrawlerService
ex,
$"Url: {url}");
}
return "";
return string.Empty;
}
}
```
Expand Down Expand Up @@ -136,8 +136,8 @@ public class WebScraperHtmlSanitizer
textContent = HTMLHelper.RegexHtmlToTextWhiteSpace.Replace(textContent, " ");
textContent = textContent.Trim();

string title = doc.Head?.QuerySelector("title")?.TextContent ?? "";
string description = doc.Head?.QuerySelector("meta[name='description']")?.GetAttribute("content") ?? "";
string title = doc.Head?.QuerySelector("title")?.TextContent ?? string.Empty;
string description = doc.Head?.QuerySelector("meta[name='description']")?.GetAttribute("content") ?? string.Empty;

return string.Join(
" ",
Expand Down Expand Up @@ -190,7 +190,7 @@ public override async Task<Document?> MapToLuceneDocumentOrNull(IIndexEventItemM
// Setup same as examples in Usage-Guide.md
// ...

string content = "";
string content = string.Empty;

if (item is IndexEventWebPageItemModel webpageItem &&
string.Equals(indexedModel.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
Expand Down
8 changes: 4 additions & 4 deletions docs/Search-index-querying.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ To index all existing content, rebuild the index in Xperience's Administration w
```csharp
public class GlobalSearchResultModel
{
public string Title { get; set; } = "";
public string ContentType { get; set; } = "";
public string Url { get; set; } = "";
public string Title { get; set; } = string.Empty;
public string ContentType { get; set; } = string.Empty;
public string Url { get; set; } = string.Empty;

public static List<string> PossibleFacets { get; set; } = new List<string>
{
Expand Down Expand Up @@ -105,7 +105,7 @@ public class SearchService

return new LuceneSearchResultModel<GlobalSearchResultModel>
{
Query = searchText ?? "",
Query = searchText ?? string.Empty,
Page = page,
PageSize = pageSize,
TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
Expand Down
86 changes: 42 additions & 44 deletions examples/DancingGoat/Search/AdvancedSearchIndexingStrategy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace DancingGoat.Search;

public class AdvancedSearchIndexingStrategy : DefaultLuceneIndexingStrategy
{
public static string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";
public const string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";

private readonly IWebPageQueryResultMapper webPageMapper;
private readonly IContentQueryExecutor queryExecutor;
Expand Down Expand Up @@ -43,60 +43,58 @@ WebCrawlerService webCrawler
{
var document = new Document();

string sortableTitle = "";
string title = "";
string content = "";
string sortableTitle = string.Empty;
string title = string.Empty;
string content = string.Empty;

// IIndexEventItemModel could be a reusable content item or a web page item, so we use
// pattern matching to get access to the web page item specific type and fields
if (item is IndexEventWebPageItemModel indexedPage)
if (item is not IndexEventWebPageItemModel indexedPage)
{
if (string.Equals(item.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
return null;
}

if (string.Equals(item.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
{
// The implementation of GetPage<T>() is below
var page = await GetPage<ArticlePage>(
indexedPage.ItemGuid,
indexedPage.WebsiteChannelName,
indexedPage.LanguageName,
ArticlePage.CONTENT_TYPE_NAME);

if (page is null)
{
// The implementation of GetPage<T>() is below
var page = await GetPage<ArticlePage>(
indexedPage.ItemGuid,
indexedPage.WebsiteChannelName,
indexedPage.LanguageName,
ArticlePage.CONTENT_TYPE_NAME);

if (page is null)
{
return null;
}

sortableTitle = title = page?.ArticleTitle ?? "";

string rawContent = await webCrawler.CrawlWebPage(page!);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
return null;
}
else if (string.Equals(item.ContentTypeName, HomePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))

sortableTitle = title = page?.ArticleTitle ?? string.Empty;

string rawContent = await webCrawler.CrawlWebPage(page!);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
}
else if (string.Equals(item.ContentTypeName, HomePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
{
var page = await GetPage<HomePage>(
indexedPage.ItemGuid,
indexedPage.WebsiteChannelName,
indexedPage.LanguageName,
HomePage.CONTENT_TYPE_NAME);

if (page is null)
{
var page = await GetPage<HomePage>(
indexedPage.ItemGuid,
indexedPage.WebsiteChannelName,
indexedPage.LanguageName,
HomePage.CONTENT_TYPE_NAME);

if (page is null)
{
return null;
}

if (page.HomePageBanner.IsNullOrEmpty())
{
return null;
}

sortableTitle = title = page!.HomePageBanner.First().BannerText;

string rawContent = await webCrawler.CrawlWebPage(page!);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
return null;
}
else

if (page.HomePageBanner.IsNullOrEmpty())
{
return null;
}

sortableTitle = title = page!.HomePageBanner.First().BannerText;

string rawContent = await webCrawler.CrawlWebPage(page!);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
}
else
{
Expand Down
6 changes: 3 additions & 3 deletions examples/DancingGoat/Search/DancingGoatSearchResultModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

public class DancingGoatSearchResultModel
{
public string Title { get; set; } = "";
public string ContentType { get; set; } = "";
public string Url { get; set; } = "";
public string Title { get; set; } = string.Empty;
public string ContentType { get; set; } = string.Empty;
public string Url { get; set; } = string.Empty;
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ public static IServiceCollection AddKenticoDancingGoatLuceneServices(this IServi
{
builder.RegisterStrategy<AdvancedSearchIndexingStrategy>("DancingGoatExampleStrategy");
builder.RegisterStrategy<SimpleSearchIndexingStrategy>("DancingGoatMinimalExampleStrategy");
builder.RegisterStrategy<ReusableContentItemsIndexingStrategy>(nameof(ReusableContentItemsIndexingStrategy));
builder.RegisterAnalyzer<CzechAnalyzer>("Czech analyzer");
});

Expand Down
113 changes: 113 additions & 0 deletions examples/DancingGoat/Search/ReusableContentItemsIndexingStrategy.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
using CMS.ContentEngine;
using CMS.Websites;

using DancingGoat.Models;
using DancingGoat.Search.Services;

using Kentico.Xperience.Lucene.Core;
using Kentico.Xperience.Lucene.Core.Indexing;

using Lucene.Net.Documents;
using Lucene.Net.Facet;

namespace DancingGoat.Search;

/// <summary>
/// Example indexing strategy for reusable content items.
/// When a <c>Banner</c> reusable item is processed, the Lucene document is built from a
/// <c>HomePage</c> in the <see cref="INDEXED_WEBSITECHANNEL_NAME"/> channel that links to the
/// banner through its <c>HomePageBanner</c> field.
/// </summary>
public class ReusableContentItemsIndexingStrategy : DefaultLuceneIndexingStrategy
{
    public const string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";
    public const string FACET_DIMENSION = "ContentType";
    public const string INDEXED_WEBSITECHANNEL_NAME = "DancingGoatPages";
    public const string CRAWLER_CONTENT_FIELD_NAME = "Content";

    private readonly IWebPageQueryResultMapper webPageMapper;
    private readonly IContentQueryExecutor queryExecutor;
    private readonly IWebPageUrlRetriever urlRetriever;
    private readonly WebScraperHtmlSanitizer htmlSanitizer;
    private readonly WebCrawlerService webCrawler;

    /// <summary>
    /// Creates the strategy with the services it uses to query, map, crawl and sanitize
    /// the web page associated with an indexed reusable item.
    /// </summary>
    public ReusableContentItemsIndexingStrategy(
        IWebPageQueryResultMapper webPageMapper,
        IContentQueryExecutor queryExecutor,
        IWebPageUrlRetriever urlRetriever,
        WebScraperHtmlSanitizer htmlSanitizer,
        WebCrawlerService webCrawler
    )
    {
        this.urlRetriever = urlRetriever;
        this.webPageMapper = webPageMapper;
        this.queryExecutor = queryExecutor;
        this.htmlSanitizer = htmlSanitizer;
        this.webCrawler = webCrawler;
    }

    /// <summary>
    /// Maps a reusable <c>Banner</c> item to a Lucene document describing the first
    /// <c>HomePage</c> that links to it.
    /// </summary>
    /// <param name="item">The item that triggered indexing.</param>
    /// <returns>
    /// The document to index, or <c>null</c> when the item is not a reusable <c>Banner</c>,
    /// no linking home page (or banner on it) is found, or the page URL cannot be retrieved.
    /// </returns>
    public override async Task<Document?> MapToLuceneDocumentOrNull(IIndexEventItemModel item)
    {
        // IIndexEventItemModel could be a reusable content item or a web page item, so we use
        // pattern matching to get access to the reusable content item specific type and fields
        if (item is not IndexEventReusableItemModel indexedItem)
        {
            return null;
        }

        if (!string.Equals(item.ContentTypeName, Banner.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        var query = new ContentItemQueryBuilder()
            .ForContentType(HomePage.CONTENT_TYPE_NAME,
                config =>
                    config
                        .WithLinkedItems(4)

                        // Because the indexed item is a reusable content item, we don't have a website channel name to use here
                        // so we use a hardcoded channel name.
                        .ForWebsite(INDEXED_WEBSITECHANNEL_NAME)

                        // Retrieves all HomePages that link to the Banner through the HomePage.HomePageBanner field
                        .Linking(nameof(HomePage.HomePageBanner), new[] { indexedItem.ItemID }))
            .InLanguage(indexedItem.LanguageName);

        var linkingPages = await queryExecutor.GetWebPageResult(query, webPageMapper.Map<HomePage>);

        // The banner may not be linked from any home page (or not in this language);
        // in that case there is nothing to index instead of failing the whole task.
        var associatedWebPageItem = linkingPages.FirstOrDefault();
        if (associatedWebPageItem is null)
        {
            return null;
        }

        // Same guard as the web page strategy: a home page without a banner yields no document.
        var banner = associatedWebPageItem.HomePageBanner?.FirstOrDefault();
        if (banner is null)
        {
            return null;
        }

        string url;
        try
        {
            url = (await urlRetriever.Retrieve(associatedWebPageItem.SystemFields.WebPageItemTreePath,
                INDEXED_WEBSITECHANNEL_NAME, indexedItem.LanguageName)).RelativePath;
        }
        catch (Exception)
        {
            // Retrieve can throw an exception when processing a page update LuceneQueueItem
            // and the page was deleted before the update task has processed. In this case, return no item.
            return null;
        }

        string title = banner.BannerText;
        string rawContent = await webCrawler.CrawlWebPage(associatedWebPageItem);
        string content = htmlSanitizer.SanitizeHtmlDocument(rawContent);

        var document = new Document();

        // If the indexed item is a reusable content item, we need to set the url manually.
        document.Add(new StringField(BaseDocumentProperties.URL, url, Field.Store.YES));
        document.Add(new TextField(nameof(DancingGoatSearchResultModel.Title), title, Field.Store.YES));
        document.Add(new StringField(SORTABLE_TITLE_FIELD_NAME, title, Field.Store.YES));
        document.Add(new TextField(CRAWLER_CONTENT_FIELD_NAME, content, Field.Store.NO));

        return document;
    }

    /// <summary>
    /// Builds the facet configuration, marking <see cref="FACET_DIMENSION"/> as multi-valued.
    /// </summary>
    public override FacetsConfig FacetsConfigFactory()
    {
        var facetConfig = new FacetsConfig();

        facetConfig.SetMultiValued(FACET_DIMENSION, true);

        return facetConfig;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public LuceneSearchResultModel<DancingGoatSearchResultModel> GlobalSearch(

return new LuceneSearchResultModel<DancingGoatSearchResultModel>
{
Query = searchText ?? "",
Query = searchText ?? string.Empty,
Page = page,
PageSize = pageSize,
TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public LuceneSearchResultModel<DancingGoatSearchResultModel> GlobalSearch(

return new LuceneSearchResultModel<DancingGoatSearchResultModel>
{
Query = searchText ?? "",
Query = searchText ?? string.Empty,
Page = page,
PageSize = pageSize,
TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
Expand Down
Loading
Loading