Skip to content

Commit

Permalink
Docstore documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
kohler committed Sep 27, 2024
1 parent 36f5300 commit a1caa74
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 60 deletions.
56 changes: 33 additions & 23 deletions batch/cleandocstore.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class CleanDocstore_Batch {
public $dry_run;
/** @var bool */
public $keep_temp;
/** @var bool */
public $only_temp;
/** @var int */
public $cutoff;
/** @var DocumentHashMatcher */
Expand All @@ -49,10 +51,17 @@ function __construct(Conf $conf, $docstores, $arg) {
$this->verbose = isset($arg["verbose"]);
$this->dry_run = isset($arg["dry-run"]);
$this->keep_temp = isset($arg["keep-temp"]);
$this->only_temp = isset($arg["only-temp"]) || !$conf->s3_client();
$this->cutoff = isset($arg["all"]) ? Conf::$now + 86400 : Conf::$now - 86400;
$this->hash_matcher = new DocumentHashMatcher($arg["match"] ?? null);
}

/** Report whether a match belongs to the tree whose `treeid` is 1.
 *
 * NOTE(review): tree id 1 appears to be reserved for the temporary
 * docstore tree (`tmp/%w`), with id 0 used for the main docstore
 * pattern -- confirm against where the trees are constructed.
 *
 * @param DocumentFileTreeMatch $fm
 * @return bool */
static function is_temp($fm) {
    $source_tree = $fm->tree;
    return $source_tree->treeid === 1;
}

/** @return ?DocumentFileTreeMatch */
function fparts_random_match() {
$fmatches = [];
Expand All @@ -66,7 +75,7 @@ function fparts_random_match() {
++$j) {
$fm = $ftree->random_match();
if ($fm->is_complete()
&& (($fm->treeid & 1) === 0
&& (!self::is_temp($fm)
|| max($fm->atime(), $fm->mtime()) < $this->cutoff)) {
++$n;
$fmatches[] = $fm;
Expand All @@ -75,7 +84,8 @@ function fparts_random_match() {
}
}
if ($n === 0) {
$this->ftrees[$i] = null;
array_splice($this->ftrees, $i, 1);
--$i;
}
}
usort($fmatches, function ($a, $b) {
Expand All @@ -86,11 +96,11 @@ function fparts_random_match() {
return $at ? -1 : ($bt ? 1 : 0);
}
$aage = Conf::$now - $at;
if ($a->treeid & 1) {
if (self::is_temp($a)) {
$aage = $aage > 604800 ? 100000000 : $aage * 2;
}
$bage = Conf::$now - $bt;
if ($b->treeid & 1) {
if (self::is_temp($b)) {
$bage = $bage > 604800 ? 100000000 : $bage * 2;
}
return $bage <=> $aage;
Expand All @@ -99,7 +109,7 @@ function fparts_random_match() {
return null;
} else {
$fm = $fmatches[0];
$this->ftrees[$fm->treeid]->hide($fm);
$fm->tree->hide($fm);
return $fm;
}
}
Expand Down Expand Up @@ -142,7 +152,7 @@ function run() {
}

if (empty($this->docstores) || !$this->conf->docstore()) {
throw new ErrorException("No docstore to clean");
throw new CommandLineException("No docstore to clean");
}

preg_match('/\A((?:\/[^\/%]*(?=\/|\z))+)/', $this->docstores[0], $m);
Expand All @@ -153,7 +163,7 @@ function run() {
$ts = disk_total_space($usage_directory);
$fs = disk_free_space($usage_directory);
if ($ts === false || $fs === false) {
throw new ErrorException("{$usage_directory}: Cannot evaluate free space");
throw new CommandLineException("{$usage_directory}: Cannot evaluate free space");
} else if ($fs >= $ts * (1 - ($this->max_usage ?? $this->min_usage))) {
if (!$this->quiet) {
fwrite(STDOUT, "{$usage_directory}: free space sufficient\n");
Expand All @@ -170,13 +180,13 @@ function run() {

foreach ($this->docstores as $i => $dp) {
if (!str_starts_with($dp, "/") || strpos($dp, "%") === false) {
throw new ErrorException("{$dp}: Bad docstore pattern");
throw new CommandLineException("{$dp}: Bad docstore pattern");
}
if (!$this->only_temp) {
$this->ftrees[] = new DocumentFileTree($dp, $this->hash_matcher, 0);
}
$this->ftrees[] = new DocumentFileTree($dp, $this->hash_matcher, count($this->ftrees));
if (!$this->keep_temp) {
$this->ftrees[] = new DocumentFileTree(Filer::docstore_fixed_prefix($dp) . "tmp/%w", $this->hash_matcher, count($this->ftrees));
} else {
$this->ftrees[] = null;
$this->ftrees[] = new DocumentFileTree(Filer::docstore_fixed_prefix($dp) . "tmp/%w", $this->hash_matcher, 1);
}
}

Expand All @@ -185,8 +195,7 @@ function run() {
while ($count > 0
&& ($usage_threshold === null || $bytesremoved < $usage_threshold)
&& ($fm = $this->fparts_random_match())) {
if (($fm->treeid & 1) !== 0
|| $this->check_match($fm)) {
if (self::is_temp($fm) || $this->check_match($fm)) {
$size = filesize($fm->fname);
if ($this->dry_run || unlink($fm->fname)) {
if ($this->verbose) {
Expand All @@ -205,30 +214,31 @@ function run() {
if (!$this->quiet) {
fwrite(STDOUT, $usage_directory . ": " . ($this->dry_run ? "would remove " : "removed ") . plural($nsuccess, "file") . ", " . plural($bytesremoved, "byte") . "\n");
}
if ($nsuccess == 0) {
fwrite(STDERR, "Nothing to delete\n");
if ($nsuccess === 0 && !$this->quiet) {
fwrite(STDERR, "Nothing to clean\n");
}
return $nsuccess && $nsuccess == $ndone ? 0 : 1;
return $nsuccess > 0 && $nsuccess === $ndone ? 0 : 1;
}

/** @return CleanDocstore_Batch */
static function make_args($argv) {
$arg = (new Getopt)->long(
"name:,n: !",
"config: !",
"help,h",
"count:,c: {n} =COUNT Clean up to COUNT files",
"match:,m: =MATCH Clean files matching MATCH",
"verbose,V",
"dry-run,d Do not remove files",
"max-usage:,u: {f} =FRAC Clean until usage is below FRAC",
"min-usage:,U: {f} =FRAC Do not clean if usage is below FRAC",
"all Clean all files, including files recently modified",
"quiet,silent,q",
"keep-temp",
"docstore"
"quiet,silent,q Be quiet",
"keep-temp Keep temporary files",
"only-temp Only clean temporary files",
"help,h",
"verbose,V Be more verbose",
"docstore Output docstore patterns and exit"
)->helpopt("help")
->description("Remove files from HotCRP docstore that are on S3.
->description("Remove old files from HotCRP docstore
Usage: php batch/cleandocstore.php [-c COUNT|-u FRAC] [-V] [-d] [DOCSTORES...]\n")
->parse($argv);

Expand Down
66 changes: 66 additions & 0 deletions devel/manual/docstore.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# HotCRP document store

HotCRP submission metadata are stored in the per-conference MySQL database. In
default installations, document contents, such as PDF files, are also stored
in the database. However, the system can also be configured to store document
contents on the filesystem in a **document store** or **docstore**.


## Configuring the docstore

Set up a document store by configuring `$Opt["docstore"]`. This setting is a
filename pattern that sets where documents are stored on the filesystem.

To determine the filename for a document, HotCRP expands `%` escapes in
`$Opt["docstore"]` using document information. The escapes are:

| Escape | Meaning | Examples |
|:-------|:--------|:---------|
| `%H` | Content hash | `d16c7976d9081368c7dca2da3a771065c3222069a1ad80dcd99d972b2efadc8b` |
| `%NH` | First `N` hexadecimal characters of content hash | `d16` (for `%3H`) |
| `%a` | Hash algorithm | `sha256`, `sha1` |
| `%A` | Hash algorithm prefix | `sha2-` (for SHA-256), empty string (for SHA-1) |
| `%h` | Content hash with algorithm prefix | `sha2-d16c7976d9081368c7dca2da3a771065c3222069a1ad80dcd99d972b2efadc8b` |
| `%Nh` | Algorithm prefix followed by first `N` hexadecimal characters of content hash | `sha2-d16` (for `%3h`) |
| `%x` | File extension | `.pdf`, `.txt` |
| `%%` | Literal `%` | `%` |

A full `$Opt["docstore"]` setting must include a full hash (`%H` or `%h`). If
`$Opt["docstore"]` does not include a `%` sign, then HotCRP automatically
appends `/%h%x` to the setting value, and if `$Opt["docstore"]` is `true`,
HotCRP uses `docs/%h%x`. Relative paths are interpreted relative to the HotCRP
installation directory.

The HotCRP PHP server must have read and write permission to the document
store. `php-fpm` and/or `httpd` typically own the docstore directory, or they
have group access (and the docstore directory has set-group-id permission).
HotCRP will create subdirectories as necessary; for instance, with docstore
`"docs/%2h/%H%x"`, HotCRP might try to create the docstore subdirectory
`docs/sha2-d1` to fit a file with SHA-256 hash
`d16c7976d9081368c7dca2da3a771065c3222069a1ad80dcd99d972b2efadc8b`.


## Temporary docstore

A special subdirectory of the docstore is used for large temporary files,
especially files that may need to outlive a single request. Examples include
chunks of uploaded documents and constructed ZIP archives and CSV files.

To form the temporary docstore, HotCRP appends `/tmp` to the docstore’s fixed
prefix. For example, the docstore `"/home/hotcrp/docs/sub-%3h/%H%x"` has
temporary docstore `/home/hotcrp/docs/tmp`.

The temporary docstore should be cleaned periodically, for instance by the
batch script `php batch/cleandocstore.php`.


## Docstore, database, and S3

HotCRP can store documents in the MySQL database (the default), in the
docstore, and in Amazon S3. Amazon S3 is typically the slowest of these
methods, but needs no separate backup. If you have configured either the
docstore or S3, you can disable database storage by setting
`$Opt["dbNoPapers"]` to `true`. If you have configured the docstore *and* S3,
then the docstore can act as a cache for S3. Incoming documents are stored in
both places; if a docstore version is missing later, HotCRP will check S3 for
it.
1 change: 1 addition & 0 deletions devel/manual/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ configuring features inaccessible through the public settings UI.
* [**Components**](./components.md): How HotCRP configuration JSON files work
* [**Message formatting and translation**](./fmt.md)
* [**Page configuration**](./pages.md)
* [**Document store**](./docstore.md)

## Specific pages

Expand Down
16 changes: 7 additions & 9 deletions devel/manual/sessions.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
# HotCRP session data
# HotCRP sessions

This page describes the format of HotCRP session data.
Each HotCRP session contains both **global** data, relevant for all
conferences attached to a session, and **conference** data relevant to a
single conference. Session data is stored in PHP’s native [session
encoding](https://www.php.net/manual/en/function.session-encode.php).

HotCRP sessions contain both **global** data, relevant for all conferences
attached to a session, and **conference** data, which is relevant to a single
conference. Conferences are distinguished by their **session keys**, which are
`@` followed by the conference’s database name.


## Global keys
## Global session data

### Session version and expiration

Expand Down Expand Up @@ -87,7 +85,7 @@ conference. Conferences are distinguished by their **session keys**, which are
* `login_bounce`


## Conference keys
## Conference session data

Session data relevant to one conference is stored in the session element named
by the conference’s session key, e.g., `@db-sigcomm23`. This element is
Expand Down
8 changes: 3 additions & 5 deletions etc/distoptions.php
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,13 @@
// docstore Set to true to serve papers and other downloads from a
// cache on the local filesystem. By default this cache is
// created in the "docs" directory. You can also set
// $Opt["docstore"] to a directory name.
// docstoreSubdir Set to true (or a small number, like 3) if the document
// store should use subdirectories. This can be useful if
// you expect thousands of submissions.
// $Opt["docstore"] to a directory name, or to a directory
// pattern such as "docs/%2H/%h%x".
// s3_bucket Amazon S3 bucket name to store paper submissions.
// s3_key Amazon AWS access key ID (used for S3).
// s3_secret Amazon AWS secret access key (used for S3).
// dbNoPapers Set to true to not store papers in the database.
// Requires filestore, S3 storage, or both.
// Requires docstore, S3 storage, or both.


// TIMES AND DATES
Expand Down
10 changes: 2 additions & 8 deletions src/api/api_tags.php
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,7 @@ static function run(Contact $user, $qreq, $prow) {
}

$assigner = new AssignmentSet($user);
if ($prow) {
$assigner->enable_papers($prow);
}
$assigner->enable_papers($prow);
$assigner->parse(join("\n", $x));
$mlist = $assigner->message_list();
$ok = $assigner->execute();
Expand Down Expand Up @@ -144,9 +142,8 @@ static function run(Contact $user, $qreq, $prow) {
}

/** @param Qrequest $qreq
* @param ?PaperInfo $prow
* @return JsonResult */
static function assigntags(Contact $user, $qreq, $prow) {
static function assigntags(Contact $user, $qreq) {
if (!isset($qreq->tagassignment)) {
return JsonResult::make_missing_error("tagassignment");
}
Expand All @@ -162,9 +159,6 @@ static function assigntags(Contact $user, $qreq, $prow) {
}

$assigner = new AssignmentSet($user);
if ($prow) {
$assigner->enable_papers($prow);
}
$assigner->parse(join("\n", $x));
$mlist = $assigner->message_list();
$ok = $assigner->execute();
Expand Down
7 changes: 0 additions & 7 deletions src/conference.php
Original file line number Diff line number Diff line change
Expand Up @@ -674,13 +674,6 @@ function refresh_options() {
$dpath = $docstore;
} else if ($docstore === true) {
$dpath = "docs";
} else if ($docstore === null && isset($this->opt["filestore"])) {
if (is_string($this->opt["filestore"])) {
$dpath = $this->opt["filestore"];
} else if ($this->opt["filestore"] === true) {
$dpath = "filestore";
}
$dpsubdir = $this->opt["filestoreSubdir"] ?? null;
}
if ($dpath !== "") {
if ($dpath[0] !== "/") {
Expand Down
17 changes: 9 additions & 8 deletions src/documentfiletree.php
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ static function random_index($di) {
/** @return DocumentFileTreeMatch */
function first_match(?DocumentFileTreeMatch $after = null) {
$this->clear();
$fm = new DocumentFileTreeMatch($this->treeid);
$fm = new DocumentFileTreeMatch($this);
for ($i = 0; $i < $this->_n; ++$i) {
if ($i % 2 === 0) {
$fm->fname .= $this->_components[$i];
Expand All @@ -297,7 +297,7 @@ function first_match(?DocumentFileTreeMatch $after = null) {
/** @return DocumentFileTreeMatch */
function random_match() {
$this->clear();
$fm = new DocumentFileTreeMatch($this->treeid);
$fm = new DocumentFileTreeMatch($this);
for ($i = 0; $i < $this->_n; ++$i) {
if ($i % 2 === 0) {
$fm->fname .= $this->_components[$i];
Expand All @@ -316,7 +316,7 @@ function random_match() {

function hide(DocumentFileTreeMatch $fm) {
// account for removal
assert($fm->treeid === $this->treeid);
assert($fm->tree === $this);
for ($i = count($fm->idxes) - 1; $i >= 0; --$i) {
$this->_dirinfo[$fm->bdirs[$i]]->hide_component_index($fm->idxes[$i]);
}
Expand All @@ -342,8 +342,9 @@ function jsonSerialize() {
}

class DocumentFileTreeMatch {
/** @var int */
public $treeid;
/** @var DocumentFileTree
* @readonly */
public $tree;
/** @var list<string> */
public $bdirs = [];
/** @var list<int> */
Expand All @@ -359,9 +360,9 @@ class DocumentFileTreeMatch {
/** @var null|int|false */
private $_mtime;

/** @param int $treeid */
function __construct($treeid) {
$this->treeid = $treeid;
/** @param DocumentFileTree $tree */
function __construct($tree) {
$this->tree = $tree;
}
/** @param int $idx
* @param string $suffix */
Expand Down

0 comments on commit a1caa74

Please sign in to comment.