commit 396e290394
2025-12-05 10:55:37 +01:00
1423 changed files with 825479 additions and 0 deletions

@@ -0,0 +1,5 @@
# this is here so that django finds the checks.
from documents.checks import changed_password_check
from documents.checks import parser_check
__all__ = ["changed_password_check", "parser_check"]

src/documents/admin.py

@@ -0,0 +1,229 @@
from django.conf import settings
from django.contrib import admin
from guardian.admin import GuardedModelAdmin
from treenode.admin import TreeNodeModelAdmin
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import SavedViewFilterRule
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
from documents.tasks import update_document_parent_tags
if settings.AUDIT_LOG_ENABLED:
from auditlog.admin import LogEntryAdmin
from auditlog.models import LogEntry
class CorrespondentAdmin(GuardedModelAdmin):
list_display = ("name", "match", "matching_algorithm")
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
class TagAdmin(GuardedModelAdmin, TreeNodeModelAdmin):
list_display = ("name", "color", "match", "matching_algorithm")
list_filter = ("matching_algorithm",)
list_editable = ("color", "match", "matching_algorithm")
search_fields = ("color", "name")
def save_model(self, request, obj, form, change):
old_parent = None
if change and obj.pk:
tag = Tag.objects.get(pk=obj.pk)
old_parent = tag.get_parent() if tag else None
super().save_model(request, obj, form, change)
# sync parent tags on documents if changed
new_parent = obj.get_parent()
if new_parent and old_parent != new_parent:
update_document_parent_tags(obj, new_parent)
class DocumentTypeAdmin(GuardedModelAdmin):
list_display = ("name", "match", "matching_algorithm")
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
class DocumentAdmin(GuardedModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = (
"added",
"modified",
"mime_type",
"storage_type",
"filename",
"checksum",
"archive_filename",
"archive_checksum",
"original_filename",
"deleted_at",
)
list_display_links = ("title",)
list_display = ("id", "title", "mime_type", "filename", "archive_filename")
list_filter = (
("mime_type"),
("archive_serial_number", admin.EmptyFieldListFilter),
("archive_filename", admin.EmptyFieldListFilter),
)
filter_horizontal = ("tags",)
ordering = ["-id"]
date_hierarchy = "created"
def has_add_permission(self, request):
return False
def created_(self, obj):
return obj.created.date().strftime("%Y-%m-%d")
created_.short_description = "Created"
def get_queryset(self, request): # pragma: no cover
"""
Include trashed documents
"""
return Document.global_objects.all()
def delete_queryset(self, request, queryset):
from documents import index
with index.open_index_writer() as writer:
for o in queryset:
index.remove_document(writer, o)
super().delete_queryset(request, queryset)
def delete_model(self, request, obj):
from documents import index
index.remove_document_from_index(obj)
super().delete_model(request, obj)
def save_model(self, request, obj, form, change):
from documents import index
index.add_or_update_document(obj)
super().save_model(request, obj, form, change)
class RuleInline(admin.TabularInline):
model = SavedViewFilterRule
class SavedViewAdmin(GuardedModelAdmin):
list_display = ("name", "owner")
inlines = [RuleInline]
def get_queryset(self, request): # pragma: no cover
return super().get_queryset(request).select_related("owner")
class StoragePathInline(admin.TabularInline):
model = StoragePath
class StoragePathAdmin(GuardedModelAdmin):
list_display = ("name", "path", "match", "matching_algorithm")
list_filter = ("path", "matching_algorithm")
list_editable = ("path", "match", "matching_algorithm")
class TaskAdmin(admin.ModelAdmin):
list_display = ("task_id", "task_file_name", "task_name", "date_done", "status")
list_filter = ("status", "date_done", "task_name")
search_fields = ("task_name", "task_id", "status", "task_file_name")
readonly_fields = (
"task_id",
"task_file_name",
"task_name",
"status",
"date_created",
"date_started",
"date_done",
"result",
)
class NotesAdmin(GuardedModelAdmin):
list_display = ("user", "created", "note", "document")
list_filter = ("created", "user")
list_display_links = ("created",)
raw_id_fields = ("document",)
search_fields = ("document__title",)
def get_queryset(self, request): # pragma: no cover
return (
super()
.get_queryset(request)
.select_related("user", "document__correspondent")
)
class ShareLinksAdmin(GuardedModelAdmin):
list_display = ("created", "expiration", "document")
list_filter = ("created", "expiration", "owner")
list_display_links = ("created",)
raw_id_fields = ("document",)
def get_queryset(self, request): # pragma: no cover
return super().get_queryset(request).select_related("document__correspondent")
class CustomFieldsAdmin(GuardedModelAdmin):
fields = ("name", "created", "data_type")
readonly_fields = ("created", "data_type")
list_display = ("name", "created", "data_type")
list_filter = ("created", "data_type")
class CustomFieldInstancesAdmin(GuardedModelAdmin):
fields = ("field", "document", "created", "value")
readonly_fields = ("field", "document", "created", "value")
list_display = ("field", "document", "value", "created")
search_fields = ("document__title",)
list_filter = ("created", "field")
def get_queryset(self, request): # pragma: no cover
return (
super()
.get_queryset(request)
.select_related("field", "document__correspondent")
)
admin.site.register(Correspondent, CorrespondentAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(SavedView, SavedViewAdmin)
admin.site.register(StoragePath, StoragePathAdmin)
admin.site.register(PaperlessTask, TaskAdmin)
admin.site.register(Note, NotesAdmin)
admin.site.register(ShareLink, ShareLinksAdmin)
admin.site.register(CustomField, CustomFieldsAdmin)
admin.site.register(CustomFieldInstance, CustomFieldInstancesAdmin)
if settings.AUDIT_LOG_ENABLED:
class LogEntryAUDIT(LogEntryAdmin):
def has_delete_permission(self, request, obj=None):
return False
admin.site.unregister(LogEntry)
admin.site.register(LogEntry, LogEntryAUDIT)

src/documents/apps.py

@@ -0,0 +1,33 @@
from django.apps import AppConfig
from django.utils.translation import gettext_lazy as _
class DocumentsConfig(AppConfig):
name = "documents"
verbose_name = _("Documents")
def ready(self):
from documents.signals import document_consumption_finished
from documents.signals import document_updated
from documents.signals.handlers import add_inbox_tags
from documents.signals.handlers import add_to_index
from documents.signals.handlers import run_workflows_added
from documents.signals.handlers import run_workflows_updated
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_document_type
from documents.signals.handlers import set_storage_path
from documents.signals.handlers import set_tags
document_consumption_finished.connect(add_inbox_tags)
document_consumption_finished.connect(set_correspondent)
document_consumption_finished.connect(set_document_type)
document_consumption_finished.connect(set_tags)
document_consumption_finished.connect(set_storage_path)
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_workflows_added)
document_updated.connect(run_workflows_updated)
import documents.schema # noqa: F401
AppConfig.ready(self)
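
ready() wires the consumption pipeline together purely through Django signals. As a minimal sketch of the same mechanism, an additional receiver could be attached as shown below; the handler name and the `document` keyword argument it reads are illustrative assumptions, only the signal import comes from the code above.

# Hypothetical extra receiver for document_consumption_finished.
# Assumes the signal passes the finished Document as the `document` kwarg.
from documents.signals import document_consumption_finished


def log_finished_document(sender, document=None, **kwargs):
    # Runs after the consumer has fully processed a document.
    print(f"Consumed document: {document}")


document_consumption_finished.connect(log_finished_document)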

src/documents/barcodes.py

@@ -0,0 +1,496 @@
from __future__ import annotations
import logging
import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING
from django.conf import settings
from pdf2image import convert_from_path
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.models import Tag
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import maybe_override_pixel_limit
from paperless.config import BarcodeConfig
if TYPE_CHECKING:
from collections.abc import Callable
from PIL import Image
logger = logging.getLogger("paperless.barcodes")
@dataclass(frozen=True)
class Barcode:
"""
Holds the information about a single barcode and its location in a document
"""
page: int
value: str
settings: BarcodeConfig
@property
def is_separator(self) -> bool:
"""
Returns True if the barcode value equals the configured separation value,
False otherwise
"""
return self.value == self.settings.barcode_string
@property
def is_asn(self) -> bool:
"""
Returns True if the barcode value matches the configured ASN prefix,
False otherwise
"""
return self.value.startswith(self.settings.barcode_asn_prefix)
class BarcodePlugin(ConsumeTaskPlugin):
NAME: str = "BarcodePlugin"
@property
def able_to_run(self) -> bool:
"""
Able to run if:
- any barcode feature (ASN detection, separation, or tag detection) is enabled, and
- the mime type is supported
"""
if self.settings.barcode_enable_tiff_support:
supported_mimes: set[str] = {"application/pdf", "image/tiff"}
else:
supported_mimes = {"application/pdf"}
return (
self.settings.barcode_enable_asn
or self.settings.barcodes_enabled
or self.settings.barcode_enable_tag
) and self.input_doc.mime_type in supported_mimes
def get_settings(self) -> BarcodeConfig:
"""
Returns the settings for this plugin (Django settings or app config)
"""
return BarcodeConfig()
def __init__(
self,
input_doc: ConsumableDocument,
metadata: DocumentMetadataOverrides,
status_mgr: ProgressManager,
base_tmp_dir: Path,
task_id: str,
) -> None:
super().__init__(
input_doc,
metadata,
status_mgr,
base_tmp_dir,
task_id,
)
# need these for able_to_run
self.settings = self.get_settings()
def setup(self) -> None:
self.temp_dir = tempfile.TemporaryDirectory(
dir=self.base_tmp_dir,
prefix="barcode",
)
self.pdf_file: Path = self.input_doc.original_file
self._tiff_conversion_done = False
self.barcodes: list[Barcode] = []
def run(self) -> None:
# Some operations may use PIL, override pixel setting if needed
maybe_override_pixel_limit()
# Maybe do the conversion of TIFF to PDF
self.convert_from_tiff_to_pdf()
# Locate any barcodes in the files
self.detect()
# try reading tags from barcodes
if (
self.settings.barcode_enable_tag
and (tags := self.tags) is not None
and len(tags) > 0
):
if self.metadata.tag_ids:
self.metadata.tag_ids += tags
else:
self.metadata.tag_ids = tags
logger.info(f"Found tags in barcode: {tags}")
# Lastly attempt to split documents
if self.settings.barcodes_enabled and (
separator_pages := self.get_separation_pages()
):
# We have pages to split against
# Note this does NOT use base_tmp_dir, as that will be removed
tmp_dir = Path(
tempfile.mkdtemp(
dir=settings.SCRATCH_DIR,
prefix="paperless-barcode-split-",
),
).resolve()
from documents import tasks
# Create the split document tasks
for new_document in self.separate_pages(separator_pages):
copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
task = tasks.consume_file.delay(
ConsumableDocument(
# Same source, for templates
source=self.input_doc.source,
mailrule_id=self.input_doc.mailrule_id,
# Can't use the same folder or the consumer might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
# Adding optional original_path for later uses in
# workflow matching
original_path=self.input_doc.original_file,
),
# All the same metadata
self.metadata,
)
logger.info(f"Created new task {task.id} for {new_document.name}")
# This file is now two or more files
self.input_doc.original_file.unlink()
msg = "Barcode splitting complete!"
# Update the progress to complete
self.status_mgr.send_progress(ProgressStatusOptions.SUCCESS, msg, 100, 100)
# Request the consume task stops
raise StopConsumeTaskError(msg)
# Update/overwrite an ASN if possible
# After splitting, as otherwise each split document gets the same ASN
if self.settings.barcode_enable_asn and (located_asn := self.asn) is not None:
logger.info(f"Found ASN in barcode: {located_asn}")
self.metadata.asn = located_asn
def cleanup(self) -> None:
self.temp_dir.cleanup()
def convert_from_tiff_to_pdf(self) -> None:
"""
May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
"""
# Nothing to do, pdf_file is already assigned correctly
if self.input_doc.mime_type != "image/tiff" or self._tiff_conversion_done:
return
self.pdf_file = convert_from_tiff_to_pdf(
self.input_doc.original_file,
Path(self.temp_dir.name),
)
self._tiff_conversion_done = True
@staticmethod
def read_barcodes_zxing(image: Image.Image) -> list[str]:
barcodes = []
import zxingcpp
detected_barcodes = zxingcpp.read_barcodes(image)
for barcode in detected_barcodes:
if barcode.text:
barcodes.append(barcode.text)
logger.debug(
f"Barcode of type {barcode.format} found: {barcode.text}",
)
return barcodes
@staticmethod
def read_barcodes_pyzbar(image: Image.Image) -> list[str]:
barcodes = []
from pyzbar import pyzbar
# Decode the barcode image
detected_barcodes = pyzbar.decode(image)
# Traverse through all the detected barcodes in image
for barcode in detected_barcodes:
if barcode.data:
decoded_barcode = barcode.data.decode("utf-8")
barcodes.append(decoded_barcode)
logger.debug(
f"Barcode of type {barcode.type} found: {decoded_barcode}",
)
return barcodes
def detect(self) -> None:
"""
Scan all pages of the PDF as images, recording each barcode and the
page it was found on as we go
"""
# Bail if barcodes already exist
if self.barcodes:
return
# No op if not a TIFF
self.convert_from_tiff_to_pdf()
# Choose the library for reading
if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR":
reader: Callable[[Image.Image], list[str]] = self.read_barcodes_pyzbar
logger.debug("Scanning for barcodes using PYZBAR")
else:
reader = self.read_barcodes_zxing
logger.debug("Scanning for barcodes using ZXING")
try:
# Read number of pages from pdf
with Pdf.open(self.pdf_file) as pdf:
num_of_pages = len(pdf.pages)
logger.debug(f"PDF has {num_of_pages} pages")
# Get limit from configuration
barcode_max_pages: int = (
num_of_pages
if self.settings.barcode_max_pages == 0
else self.settings.barcode_max_pages
)
if barcode_max_pages < num_of_pages: # pragma: no cover
logger.debug(
f"Barcodes detection will be limited to the first {barcode_max_pages} pages",
)
# Loop over each page, up to the configured limit
for current_page_number in range(min(num_of_pages, barcode_max_pages)):
logger.debug(f"Processing page {current_page_number}")
# Convert page to image
page = convert_from_path(
self.pdf_file,
dpi=self.settings.barcode_dpi,
output_folder=self.temp_dir.name,
first_page=current_page_number + 1,
last_page=current_page_number + 1,
)[0]
# Remember filename, since it is lost by upscaling
page_filepath = Path(page.filename)
logger.debug(f"Image is at {page_filepath}")
# Upscale image if configured
factor = self.settings.barcode_upscale
if factor > 1.0:
logger.debug(
f"Upscaling image by {factor} for better barcode detection",
)
x, y = page.size
page = page.resize(
(round(x * factor), round(y * factor)),
)
# Detect barcodes
for barcode_value in reader(page):
self.barcodes.append(
Barcode(current_page_number, barcode_value, self.settings),
)
# Delete temporary image file
page_filepath.unlink()
# Password protected files can't be checked
# This is the exception raised for those
except PasswordError as e:
logger.warning(
f"File is likely password protected, not checking for barcodes: {e}",
)
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
@property
def asn(self) -> int | None:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with barcode_asn_prefix
is considered the ASN to be used.
Returns the detected ASN (or None)
"""
asn = None
# Ensure the barcodes have been read
self.detect()
# get the first barcode that starts with barcode_asn_prefix
asn_text: str | None = next(
(x.value for x in self.barcodes if x.is_asn),
None,
)
if asn_text:
logger.debug(f"Found ASN Barcode: {asn_text}")
# remove the prefix and remove whitespace
asn_text = asn_text[len(self.settings.barcode_asn_prefix) :].strip()
# remove non-numeric parts of the remaining string
asn_text = re.sub(r"\D", "", asn_text)
# now, try parsing the ASN number
try:
asn = int(asn_text)
except ValueError as e:
logger.warning(f"Failed to parse ASN number because: {e}")
return asn
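
Traced on a concrete value, the parsing above strips the configured prefix, discards non-digit characters, and converts the remainder to an integer. A minimal sketch, assuming the prefix is "ASN":

import re

prefix = "ASN"                  # assumed barcode_asn_prefix
barcode_value = "ASN 00123"     # hypothetical barcode payload

stripped = barcode_value[len(prefix):].strip()  # "00123"
digits = re.sub(r"\D", "", stripped)            # "00123"
asn = int(digits)                               # 123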
@property
def tags(self) -> list[int]:
"""
Search the parsed barcodes for any tags.
Returns the detected tag ids (or empty list)
"""
tags: list[int] = []
# Ensure the barcodes have been read
self.detect()
for x in self.barcodes:
tag_texts: str = x.value
for raw in tag_texts.split(","):
try:
tag_str: str | None = None
for regex in self.settings.barcode_tag_mapping:
if re.match(regex, raw, flags=re.IGNORECASE):
sub = self.settings.barcode_tag_mapping[regex]
tag_str = (
re.sub(regex, sub, raw, flags=re.IGNORECASE)
if sub
else raw
)
break
if tag_str:
tag, _ = Tag.objects.get_or_create(
name__iexact=tag_str,
defaults={"name": tag_str},
)
logger.debug(
f"Found Tag Barcode '{raw}', substituted "
f"to '{tag}' and mapped to "
f"tag #{tag.pk}.",
)
tags.append(tag.pk)
except Exception as e:
logger.error(
f"Failed to find or create TAG '{raw}' because: {e}",
)
return tags
def get_separation_pages(self) -> dict[int, bool]:
"""
Search the parsed barcodes for separators and returns a dict of page
numbers, which separate the file into new files, together with the
information whether to keep the page.
"""
# filter all barcodes for the separator string
# get the page numbers of the separating barcodes
retain = self.settings.barcode_retain_split_pages
separator_pages = {
bc.page: retain
for bc in self.barcodes
if bc.is_separator and (not retain or (retain and bc.page > 0))
} # as below, don't include the first page if retain is enabled
if not self.settings.barcode_enable_asn:
return separator_pages
# add the page numbers of the ASN barcodes
# (except for first page, that might lead to infinite loops).
return {
**separator_pages,
**{bc.page: True for bc in self.barcodes if bc.is_asn and bc.page != 0},
}
def separate_pages(self, pages_to_split_on: dict[int, bool]) -> list[Path]:
"""
Separate the provided pdf file on the pages_to_split_on.
The pages given as keys in pages_to_split_on each start a new document;
the page itself is dropped if the corresponding value is False.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
document_paths = []
fname: str = self.input_doc.original_file.stem
with Pdf.open(self.pdf_file) as input_pdf:
# Start with an empty document
current_document: list[Page] = []
# A list of documents, ie a list of lists of pages
documents: list[list[Page]] = [current_document]
for idx, page in enumerate(input_pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
current_document.append(page)
continue
# This is a split index
# Start a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
current_document = []
documents.append(current_document)
keep_page: bool = pages_to_split_on[idx]
if keep_page:
# Keep the page
# (new document is started by asn barcode)
current_document.append(page)
documents = [x for x in documents if len(x)]
logger.debug(f"Split into {len(documents)} new documents")
# Write the new documents out
for doc_idx, document in enumerate(documents):
dst = Pdf.new()
dst.pages.extend(document)
output_filename = f"{fname}_document_{doc_idx}.pdf"
logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages")
savepath = Path(self.temp_dir.name) / output_filename
with savepath.open("wb") as out:
dst.save(out)
copy_basic_file_stats(self.input_doc.original_file, savepath)
document_paths.append(savepath)
return document_paths
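
The interplay of get_separation_pages() and separate_pages() is easiest to see on a small example. Page numbers are zero-based as in the code above; the barcode placement is invented for illustration.

# Hypothetical 6-page PDF (pages 0-5) with a separator barcode on page 2
# and an ASN barcode on page 4, ASN detection enabled, retain disabled.
#
# get_separation_pages() returns {2: False, 4: True}: both pages start a
# new document, page 2 is dropped, page 4 is kept.
#
# separate_pages({2: False, 4: True}) therefore yields three files:
#   document 0: pages 0, 1
#   document 1: page 3
#   document 2: pages 4, 5
pages_to_split_on = {2: False, 4: True}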


@@ -0,0 +1,107 @@
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from typing import NoReturn
if TYPE_CHECKING:
from collections.abc import Callable
from zipfile import ZipFile
from documents.models import Document
class BulkArchiveStrategy:
def __init__(self, zipf: ZipFile, *, follow_formatting: bool = False) -> None:
self.zipf: ZipFile = zipf
if follow_formatting:
self.make_unique_filename: Callable[..., Path | str] = (
self._formatted_filepath
)
else:
self.make_unique_filename = self._filename_only
def _filename_only(
self,
doc: Document,
*,
archive: bool = False,
folder: str = "",
) -> str:
"""
Constructs a unique name for the given document to be used inside the
zip file.
The filename might not be unique enough, so a counter is appended if needed
"""
counter = 0
while True:
filename: str = folder + doc.get_public_filename(
archive=archive,
counter=counter,
)
if filename in self.zipf.namelist():
counter += 1
else:
return filename
def _formatted_filepath(
self,
doc: Document,
*,
archive: bool = False,
folder: str = "",
) -> Path:
"""
Constructs a full file path for the given document to be used inside
the zipfile.
The path is already unique, as handled when a document is consumed or updated
"""
if archive and doc.has_archive_version:
if TYPE_CHECKING:
assert doc.archive_filename is not None
in_archive_path: Path = Path(folder) / doc.archive_filename
else:
if TYPE_CHECKING:
assert doc.filename is not None
in_archive_path = Path(folder) / doc.filename
return in_archive_path
def add_document(self, doc: Document) -> NoReturn:
raise NotImplementedError # pragma: no cover
class OriginalsOnlyStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document) -> None:
self.zipf.write(doc.source_path, self.make_unique_filename(doc))
class ArchiveOnlyStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document) -> None:
if doc.has_archive_version:
if TYPE_CHECKING:
assert doc.archive_path is not None
self.zipf.write(
doc.archive_path,
self.make_unique_filename(doc, archive=True),
)
else:
self.zipf.write(doc.source_path, self.make_unique_filename(doc))
class OriginalAndArchiveStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document) -> None:
if doc.has_archive_version:
if TYPE_CHECKING:
assert doc.archive_path is not None
self.zipf.write(
doc.archive_path,
self.make_unique_filename(doc, archive=True, folder="archive/"),
)
self.zipf.write(
doc.source_path,
self.make_unique_filename(doc, folder="originals/"),
)
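
The strategies above are normally driven by the bulk-download API endpoint; the sketch below shows how one of them could be used directly. The queryset, the output path, and running inside a configured Django environment are assumptions, and the import of the strategy class is omitted because this excerpt does not show the module's file name.

from zipfile import ZipFile

from documents.models import Document

docs = Document.objects.all()[:10]  # hypothetical selection

with ZipFile("/tmp/export.zip", "w") as zipf:
    # Originals plus archive versions, placed under originals/ and archive/.
    strategy = OriginalAndArchiveStrategy(zipf, follow_formatting=False)
    for doc in docs:
        strategy.add_document(doc)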

src/documents/bulk_edit.py

@@ -0,0 +1,728 @@
from __future__ import annotations
import hashlib
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Literal
from celery import chain
from celery import chord
from celery import group
from celery import shared_task
from django.conf import settings
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.permissions import set_permissions_for_object
from documents.plugins.helpers import DocumentsStatusManager
from documents.tasks import bulk_update_documents
from documents.tasks import consume_file
from documents.tasks import update_document_content_maybe_archive_file
if TYPE_CHECKING:
from django.contrib.auth.models import User
logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
def set_correspondent(
doc_ids: list[int],
correspondent: Correspondent,
) -> Literal["OK"]:
if correspondent:
correspondent = Correspondent.objects.only("pk").get(id=correspondent)
qs = (
Document.objects.filter(Q(id__in=doc_ids) & ~Q(correspondent=correspondent))
.select_related("correspondent")
.only("pk", "correspondent__id")
)
affected_docs = list(qs.values_list("pk", flat=True))
qs.update(correspondent=correspondent)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
def set_storage_path(doc_ids: list[int], storage_path: StoragePath) -> Literal["OK"]:
if storage_path:
storage_path = StoragePath.objects.only("pk").get(id=storage_path)
qs = (
Document.objects.filter(
Q(id__in=doc_ids) & ~Q(storage_path=storage_path),
)
.select_related("storage_path")
.only("pk", "storage_path__id")
)
affected_docs = list(qs.values_list("pk", flat=True))
qs.update(storage_path=storage_path)
bulk_update_documents.delay(
document_ids=affected_docs,
)
return "OK"
def set_document_type(doc_ids: list[int], document_type: DocumentType) -> Literal["OK"]:
if document_type:
document_type = DocumentType.objects.only("pk").get(id=document_type)
qs = (
Document.objects.filter(Q(id__in=doc_ids) & ~Q(document_type=document_type))
.select_related("document_type")
.only("pk", "document_type__id")
)
affected_docs = list(qs.values_list("pk", flat=True))
qs.update(document_type=document_type)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
def add_tag(doc_ids: list[int], tag: int) -> Literal["OK"]:
tag_obj = Tag.objects.get(pk=tag)
tags_to_add = [tag_obj, *tag_obj.get_ancestors()]
DocumentTagRelationship = Document.tags.through
to_create = []
affected_docs: set[int] = set()
for t in tags_to_add:
qs = Document.objects.filter(Q(id__in=doc_ids) & ~Q(tags__id=t.id)).only("pk")
doc_ids_missing_tag = list(qs.values_list("pk", flat=True))
affected_docs.update(doc_ids_missing_tag)
to_create.extend(
DocumentTagRelationship(document_id=doc, tag_id=t.id)
for doc in doc_ids_missing_tag
)
if to_create:
DocumentTagRelationship.objects.bulk_create(to_create)
if affected_docs:
bulk_update_documents.delay(document_ids=list(affected_docs))
return "OK"
def remove_tag(doc_ids: list[int], tag: int) -> Literal["OK"]:
tag_obj = Tag.objects.get(pk=tag)
tag_ids = [tag_obj.id, *tag_obj.get_descendants_pks()]
DocumentTagRelationship = Document.tags.through
qs = DocumentTagRelationship.objects.filter(
document_id__in=doc_ids,
tag_id__in=tag_ids,
)
affected_docs = list(qs.values_list("document_id", flat=True).distinct())
qs.delete()
if affected_docs:
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
def modify_tags(
doc_ids: list[int],
add_tags: list[int],
remove_tags: list[int],
) -> Literal["OK"]:
qs = Document.objects.filter(id__in=doc_ids).only("pk")
affected_docs = list(qs.values_list("pk", flat=True))
DocumentTagRelationship = Document.tags.through
# add with all ancestors
expanded_add_tags: set[int] = set()
add_tag_objects = Tag.objects.filter(pk__in=add_tags)
for t in add_tag_objects:
expanded_add_tags.add(int(t.id))
expanded_add_tags.update(int(pk) for pk in t.get_ancestors_pks())
# remove with all descendants
expanded_remove_tags: set[int] = set()
remove_tag_objects = Tag.objects.filter(pk__in=remove_tags)
for t in remove_tag_objects:
expanded_remove_tags.add(int(t.id))
expanded_remove_tags.update(int(pk) for pk in t.get_descendants_pks())
try:
with transaction.atomic():
if expanded_remove_tags:
DocumentTagRelationship.objects.filter(
document_id__in=affected_docs,
tag_id__in=expanded_remove_tags,
).delete()
to_create = []
if expanded_add_tags:
existing_pairs = set(
DocumentTagRelationship.objects.filter(
document_id__in=affected_docs,
tag_id__in=expanded_add_tags,
).values_list("document_id", "tag_id"),
)
to_create = [
DocumentTagRelationship(document_id=doc, tag_id=tag)
for doc in affected_docs
for tag in expanded_add_tags
if (doc, tag) not in existing_pairs
]
if to_create:
DocumentTagRelationship.objects.bulk_create(
to_create,
ignore_conflicts=True,
)
if affected_docs:
bulk_update_documents.delay(document_ids=affected_docs)
except Exception as e:
logger.error(f"Error modifying tags: {e}")
return "ERROR"
return "OK"
def modify_custom_fields(
doc_ids: list[int],
add_custom_fields: list[int] | dict,
remove_custom_fields: list[int],
) -> Literal["OK"]:
qs = Document.objects.filter(id__in=doc_ids).only("pk")
affected_docs = list(qs.values_list("pk", flat=True))
# Ensure add_custom_fields is a list of tuples, supports old API
add_custom_fields = (
add_custom_fields.items()
if isinstance(add_custom_fields, dict)
else [(field, None) for field in add_custom_fields]
)
custom_fields = CustomField.objects.filter(
id__in=[int(field) for field, _ in add_custom_fields],
).distinct()
for field_id, value in add_custom_fields:
for doc_id in affected_docs:
defaults = {}
custom_field = custom_fields.get(id=field_id)
if custom_field:
value_field = CustomFieldInstance.TYPE_TO_DATA_STORE_NAME_MAP[
custom_field.data_type
]
defaults[value_field] = value
if (
custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK
and value
and doc_id in value
):
# Prevent self-linking
continue
CustomFieldInstance.objects.update_or_create(
document_id=doc_id,
field_id=field_id,
defaults=defaults,
)
if custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
doc = Document.objects.get(id=doc_id)
reflect_doclinks(doc, custom_field, value)
# For doc link fields that are being removed, remove symmetrical links
for doclink_being_removed_instance in CustomFieldInstance.objects.filter(
document_id__in=affected_docs,
field__id__in=remove_custom_fields,
field__data_type=CustomField.FieldDataType.DOCUMENTLINK,
value_document_ids__isnull=False,
):
for target_doc_id in doclink_being_removed_instance.value:
remove_doclink(
document=Document.objects.get(
id=doclink_being_removed_instance.document.id,
),
field=doclink_being_removed_instance.field,
target_doc_id=target_doc_id,
)
# Finally, remove the custom fields
CustomFieldInstance.objects.filter(
document_id__in=affected_docs,
field_id__in=remove_custom_fields,
).hard_delete()
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@shared_task
def delete(doc_ids: list[int]) -> Literal["OK"]:
try:
Document.objects.filter(id__in=doc_ids).delete()
from documents import index
with index.open_index_writer() as writer:
for id in doc_ids:
index.remove_document_by_id(writer, id)
status_mgr = DocumentsStatusManager()
status_mgr.send_documents_deleted(doc_ids)
except Exception as e:
if "Data too long for column" in str(e):
logger.warning(
"Detected a possible incompatible database column. See https://docs.paperless-ngx.com/troubleshooting/#convert-uuid-field",
)
logger.error(f"Error deleting documents: {e!s}")
return "OK"
def reprocess(doc_ids: list[int]) -> Literal["OK"]:
for document_id in doc_ids:
update_document_content_maybe_archive_file.delay(
document_id=document_id,
)
return "OK"
def set_permissions(
doc_ids: list[int],
set_permissions,
*,
owner=None,
merge=False,
) -> Literal["OK"]:
qs = Document.objects.filter(id__in=doc_ids).select_related("owner")
if merge:
# If merging, only set owner for documents that don't have an owner
qs.filter(owner__isnull=True).update(owner=owner)
else:
qs.update(owner=owner)
for doc in qs:
set_permissions_for_object(permissions=set_permissions, object=doc, merge=merge)
affected_docs = list(qs.values_list("pk", flat=True))
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
logger.info(
f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.",
)
qs = Document.objects.filter(id__in=doc_ids)
affected_docs: list[int] = []
import pikepdf
rotate_tasks = []
for doc in qs:
if doc.mime_type != "application/pdf":
logger.warning(
f"Document {doc.id} is not a PDF, skipping rotation.",
)
continue
try:
with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf:
for page in pdf.pages:
page.rotate(degrees, relative=True)
pdf.save()
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
doc.save()
rotate_tasks.append(
update_document_content_maybe_archive_file.s(
document_id=doc.id,
),
)
logger.info(
f"Rotated document {doc.id} by {degrees} degrees",
)
affected_docs.append(doc.id)
except Exception as e:
logger.exception(f"Error rotating document {doc.id}: {e}")
if len(affected_docs) > 0:
bulk_update_task = bulk_update_documents.si(document_ids=affected_docs)
chord(header=rotate_tasks, body=bulk_update_task).delay()
return "OK"
def merge(
doc_ids: list[int],
*,
metadata_document_id: int | None = None,
delete_originals: bool = False,
archive_fallback: bool = False,
user: User | None = None,
) -> Literal["OK"]:
logger.info(
f"Attempting to merge {len(doc_ids)} documents into a single document.",
)
qs = Document.objects.filter(id__in=doc_ids)
affected_docs: list[int] = []
import pikepdf
merged_pdf = pikepdf.new()
version: str = merged_pdf.pdf_version
# use doc_ids to preserve order
for doc_id in doc_ids:
doc = qs.get(id=doc_id)
try:
doc_path = (
doc.archive_path
if archive_fallback
and doc.mime_type != "application/pdf"
and doc.has_archive_version
else doc.source_path
)
with pikepdf.open(str(doc_path)) as pdf:
version = max(version, pdf.pdf_version)
merged_pdf.pages.extend(pdf.pages)
affected_docs.append(doc.id)
except Exception as e:
logger.exception(
f"Error merging document {doc.id}, it will not be included in the merge: {e}",
)
if len(affected_docs) == 0:
logger.warning("No documents were merged")
return "OK"
filepath = (
Path(
tempfile.mkdtemp(dir=settings.SCRATCH_DIR),
)
/ f"{'_'.join([str(doc_id) for doc_id in affected_docs])[:100]}_merged.pdf"
)
merged_pdf.remove_unreferenced_resources()
merged_pdf.save(filepath, min_version=version)
merged_pdf.close()
if metadata_document_id:
metadata_document = qs.get(id=metadata_document_id)
if metadata_document is not None:
overrides: DocumentMetadataOverrides = (
DocumentMetadataOverrides.from_document(metadata_document)
)
overrides.title = metadata_document.title + " (merged)"
else:
overrides = DocumentMetadataOverrides()
else:
overrides = DocumentMetadataOverrides()
if user is not None:
overrides.owner_id = user.id
logger.info("Adding merged document to the task queue.")
consume_task = consume_file.s(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=filepath,
),
overrides,
)
if delete_originals:
logger.info(
"Queueing removal of original documents after consumption of merged document",
)
chain(consume_task, delete.si(affected_docs)).delay()
else:
consume_task.delay()
return "OK"
def split(
doc_ids: list[int],
pages: list[list[int]],
*,
delete_originals: bool = False,
user: User | None = None,
) -> Literal["OK"]:
logger.info(
f"Attempting to split document {doc_ids[0]} into {len(pages)} documents",
)
doc = Document.objects.get(id=doc_ids[0])
import pikepdf
consume_tasks = []
try:
with pikepdf.open(doc.source_path) as pdf:
for idx, split_doc in enumerate(pages):
dst: pikepdf.Pdf = pikepdf.new()
for page in split_doc:
dst.pages.append(pdf.pages[page - 1])
filepath: Path = (
Path(
tempfile.mkdtemp(dir=settings.SCRATCH_DIR),
)
/ f"{doc.id}_{split_doc[0]}-{split_doc[-1]}.pdf"
)
dst.remove_unreferenced_resources()
dst.save(filepath)
dst.close()
overrides: DocumentMetadataOverrides = (
DocumentMetadataOverrides().from_document(doc)
)
overrides.title = f"{doc.title} (split {idx + 1})"
if user is not None:
overrides.owner_id = user.id
logger.info(
f"Adding split document with pages {split_doc} to the task queue.",
)
consume_tasks.append(
consume_file.s(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=filepath,
),
overrides,
),
)
if delete_originals:
logger.info(
"Queueing removal of original document after consumption of the split documents",
)
chord(header=consume_tasks, body=delete.si([doc.id])).delay()
else:
group(consume_tasks).delay()
except Exception as e:
logger.exception(f"Error splitting document {doc.id}: {e}")
return "OK"
def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
logger.info(
f"Attempting to delete pages {pages} from {len(doc_ids)} documents",
)
doc = Document.objects.get(id=doc_ids[0])
pages = sorted(pages) # sort pages to avoid index issues
import pikepdf
try:
with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf:
offset = 1 # pages are 1-indexed
for page_num in pages:
pdf.pages.remove(pdf.pages[page_num - offset])
offset += 1 # remove() changes the index of the pages
pdf.remove_unreferenced_resources()
pdf.save()
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
if doc.page_count is not None:
doc.page_count = doc.page_count - len(pages)
doc.save()
update_document_content_maybe_archive_file.delay(document_id=doc.id)
logger.info(f"Deleted pages {pages} from document {doc.id}")
except Exception as e:
logger.exception(f"Error deleting pages from document {doc.id}: {e}")
return "OK"
def edit_pdf(
doc_ids: list[int],
operations: list[dict],
*,
delete_original: bool = False,
update_document: bool = False,
include_metadata: bool = True,
user: User | None = None,
) -> Literal["OK"]:
"""
Operations is a list of dictionaries describing the final PDF pages.
Each entry must contain the original page number in `page` and may
specify `rotate` in degrees and `doc` indicating the output
document index (for splitting). Pages omitted from the list are
discarded.
"""
logger.info(
f"Editing PDF of document {doc_ids[0]} with {len(operations)} operations",
)
doc = Document.objects.get(id=doc_ids[0])
import pikepdf
pdf_docs: list[pikepdf.Pdf] = []
try:
with pikepdf.open(doc.source_path) as src:
# prepare output documents
max_idx = max(op.get("doc", 0) for op in operations)
pdf_docs = [pikepdf.new() for _ in range(max_idx + 1)]
if update_document and len(pdf_docs) > 1:
logger.error(
"Update requested but multiple output documents specified",
)
raise ValueError("Multiple output documents specified")
for op in operations:
dst = pdf_docs[op.get("doc", 0)]
page = src.pages[op["page"] - 1]
dst.pages.append(page)
if op.get("rotate"):
dst.pages[-1].rotate(op["rotate"], relative=True)
if update_document:
temp_path = doc.source_path.with_suffix(".tmp.pdf")
pdf = pdf_docs[0]
pdf.remove_unreferenced_resources()
# save the edited PDF to a temporary file in case of errors
pdf.save(temp_path)
# replace the original document with the edited one
temp_path.replace(doc.source_path)
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
doc.page_count = len(pdf.pages)
doc.save()
update_document_content_maybe_archive_file.delay(document_id=doc.id)
else:
consume_tasks = []
overrides = (
DocumentMetadataOverrides().from_document(doc)
if include_metadata
else DocumentMetadataOverrides()
)
if user is not None:
overrides.owner_id = user.id
for idx, pdf in enumerate(pdf_docs, start=1):
filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_edit_{idx}.pdf"
)
pdf.remove_unreferenced_resources()
pdf.save(filepath)
consume_tasks.append(
consume_file.s(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=filepath,
),
overrides,
),
)
if delete_original:
chord(header=consume_tasks, body=delete.si([doc.id])).delay()
else:
group(consume_tasks).delay()
except Exception as e:
logger.exception(f"Error editing document {doc.id}: {e}")
raise ValueError(
f"An error occurred while editing the document: {e}",
) from e
return "OK"
def reflect_doclinks(
document: Document,
field: CustomField,
target_doc_ids: list[int],
):
"""
Add or remove 'symmetrical' links to `document` on all `target_doc_ids`
"""
if target_doc_ids is None:
target_doc_ids = []
# Check if any documents are going to be removed from the current list of links and remove the symmetrical links
current_field_instance = CustomFieldInstance.objects.filter(
field=field,
document=document,
).first()
if current_field_instance is not None and current_field_instance.value is not None:
for doc_id in current_field_instance.value:
if doc_id not in target_doc_ids:
remove_doclink(
document=document,
field=field,
target_doc_id=doc_id,
)
# Create an instance if target doc doesn't have this field or append it to an existing one
existing_custom_field_instances = {
custom_field.document_id: custom_field
for custom_field in CustomFieldInstance.objects.filter(
field=field,
document_id__in=target_doc_ids,
)
}
custom_field_instances_to_create = []
custom_field_instances_to_update = []
for target_doc_id in target_doc_ids:
target_doc_field_instance = existing_custom_field_instances.get(
target_doc_id,
)
if target_doc_field_instance is None:
custom_field_instances_to_create.append(
CustomFieldInstance(
document_id=target_doc_id,
field=field,
value_document_ids=[document.id],
),
)
elif target_doc_field_instance.value is None:
target_doc_field_instance.value_document_ids = [document.id]
custom_field_instances_to_update.append(target_doc_field_instance)
elif document.id not in target_doc_field_instance.value:
target_doc_field_instance.value_document_ids.append(document.id)
custom_field_instances_to_update.append(target_doc_field_instance)
CustomFieldInstance.objects.bulk_create(custom_field_instances_to_create)
CustomFieldInstance.objects.bulk_update(
custom_field_instances_to_update,
["value_document_ids"],
)
Document.objects.filter(id__in=target_doc_ids).update(modified=timezone.now())
def remove_doclink(
document: Document,
field: CustomField,
target_doc_id: int,
):
"""
Removes a 'symmetrical' link to `document` from the target document's existing custom field instance
"""
target_doc_field_instance = CustomFieldInstance.objects.filter(
document_id=target_doc_id,
field=field,
).first()
if (
target_doc_field_instance is not None
and document.id in target_doc_field_instance.value
):
target_doc_field_instance.value.remove(document.id)
target_doc_field_instance.save()
Document.objects.filter(id=target_doc_id).update(modified=timezone.now())
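
reflect_doclinks() and remove_doclink() keep document-link custom fields symmetrical: when one document lists others in a link field, each listed document gains or loses a matching back-reference. Sketched with hypothetical ids:

# Hypothetical document-link field `related`:
#
#   document 10 sets related = [11, 12]
#     -> reflect_doclinks(doc_10, related_field, [11, 12])
#        documents 11 and 12 now list 10 in their own `related` value
#
#   document 10 later sets related = [11]
#     -> remove_doclink(doc_10, related_field, 12) runs for the dropped id,
#        so document 12 no longer references 10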

src/documents/caching.py

@@ -0,0 +1,296 @@
from __future__ import annotations
import logging
import pickle
from binascii import hexlify
from collections import OrderedDict
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Any
from typing import Final
from django.conf import settings
from django.core.cache import cache
from django.core.cache import caches
from documents.models import Document
if TYPE_CHECKING:
from django.core.cache.backends.base import BaseCache
from documents.classifier import DocumentClassifier
logger = logging.getLogger("paperless.caching")
@dataclass(frozen=True)
class MetadataCacheData:
original_checksum: str
original_metadata: list
archive_checksum: str | None
archive_metadata: list | None
@dataclass(frozen=True)
class SuggestionCacheData:
classifier_version: int
classifier_hash: str
suggestions: dict
CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
CACHE_1_MINUTE: Final[int] = 60
CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
read_cache = caches["read-cache"]
class LRUCache:
def __init__(self, capacity: int = 128):
self._data = OrderedDict()
self.capacity = capacity
def get(self, key, default=None) -> Any | None:
if key in self._data:
self._data.move_to_end(key)
return self._data[key]
return default
def set(self, key, value) -> None:
self._data[key] = value
self._data.move_to_end(key)
while len(self._data) > self.capacity:
self._data.popitem(last=False)
class StoredLRUCache(LRUCache):
"""
LRU cache that can persist its entire contents as a single entry in a backend cache.
Useful for sharing a cache across multiple workers or processes.
Workflow:
1. Load the cache state from the backend using `load()`.
2. Use `get()` and `set()` locally as usual.
3. Persist changes back to the backend using `save()`.
"""
def __init__(
self,
backend_key: str,
capacity: int = 128,
backend: BaseCache = read_cache,
backend_ttl=settings.CACHALOT_TIMEOUT,
):
if backend_key is None:
raise ValueError("backend_key is mandatory")
super().__init__(capacity)
self._backend_key = backend_key
self._backend = backend
self.backend_ttl = backend_ttl
def load(self) -> None:
"""
Load the whole cache content from backend storage.
If no valid cached data exists in the backend, the local cache is cleared.
"""
serialized_data = self._backend.get(self._backend_key)
try:
self._data = (
pickle.loads(serialized_data) if serialized_data else OrderedDict()
)
except pickle.PickleError:
logger.warning(
"Cache exists in backend but could not be read (possibly invalid format)",
)
def save(self) -> None:
"""Save the entire local cache to the backend as a serialized object.
The backend entry will expire after the configured TTL.
"""
self._backend.set(
self._backend_key,
pickle.dumps(self._data),
self.backend_ttl,
)
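
The load/get/set/save workflow from the class docstring, as a minimal sketch; the backend key and the cached values are assumptions, and a configured "read-cache" backend is required.

# Hypothetical worker-side usage of StoredLRUCache.
cache = StoredLRUCache("stem_cache_v9", capacity=1000)

cache.load()                     # pull the shared state from the backend
stem = cache.get("running")      # plain local LRU lookup
if stem is None:
    cache.set("running", "run")  # compute locally and remember
cache.save()                     # publish the updated state for other workers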
def get_suggestion_cache_key(document_id: int) -> str:
"""
Returns the basic key for a document's suggestions
"""
return f"doc_{document_id}_suggest"
def get_suggestion_cache(document_id: int) -> SuggestionCacheData | None:
"""
If possible, return the cached suggestions for the given document ID.
The classifier needs to be matching in format and hash and the suggestions need to
have been cached once.
"""
from documents.classifier import DocumentClassifier
doc_key = get_suggestion_cache_key(document_id)
cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
# The document suggestions are in the cache
if doc_key in cache_hits:
doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
# The classifier format is the same
# The classifier hash is the same
# Then the suggestions can be used
if (
CLASSIFIER_VERSION_KEY in cache_hits
and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
) and (
CLASSIFIER_HASH_KEY in cache_hits
and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
):
return doc_suggestions
else: # pragma: no cover
# Remove the key because something didn't match
cache.delete(doc_key)
return None
def set_suggestions_cache(
document_id: int,
suggestions: dict,
classifier: DocumentClassifier | None,
*,
timeout=CACHE_50_MINUTES,
) -> None:
"""
Caches the given suggestions, which were generated by the given classifier. If there is no classifier,
this function is a no-op (there won't be suggestions then anyway)
"""
if classifier is not None:
doc_key = get_suggestion_cache_key(document_id)
cache.set(
doc_key,
SuggestionCacheData(
classifier.FORMAT_VERSION,
hexlify(classifier.last_auto_type_hash).decode(),
suggestions,
),
timeout,
)
def refresh_suggestions_cache(
document_id: int,
*,
timeout: int = CACHE_50_MINUTES,
) -> None:
"""
Refreshes the expiration of the suggestions for the given document ID
to the given timeout
"""
doc_key = get_suggestion_cache_key(document_id)
cache.touch(doc_key, timeout)
def get_metadata_cache_key(document_id: int) -> str:
"""
Returns the basic key for a document's metadata
"""
return f"doc_{document_id}_metadata"
def get_metadata_cache(document_id: int) -> MetadataCacheData | None:
"""
Returns the cached document metadata for the given document ID, as long as the metadata
was cached once and the checksums have not changed
"""
doc_key = get_metadata_cache_key(document_id)
doc_metadata: MetadataCacheData | None = cache.get(doc_key)
# The metadata exists in the cache
if doc_metadata is not None:
try:
doc = Document.objects.only(
"pk",
"checksum",
"archive_checksum",
"archive_filename",
).get(pk=document_id)
# The original checksums match
# If it has one, the archive checksums match
# Then, we can use the metadata
if (
doc_metadata.original_checksum == doc.checksum
and doc.has_archive_version
and doc_metadata.archive_checksum is not None
and doc_metadata.archive_checksum == doc.archive_checksum
):
# Refresh cache
cache.touch(doc_key, CACHE_50_MINUTES)
return doc_metadata
else: # pragma: no cover
# Something didn't match, delete the key
cache.delete(doc_key)
except Document.DoesNotExist: # pragma: no cover
# Unlikely, but the cache key existed while the Document didn't
cache.delete(doc_key)
return None
def set_metadata_cache(
document: Document,
original_metadata: list,
archive_metadata: list | None,
*,
timeout=CACHE_50_MINUTES,
) -> None:
"""
Sets the metadata into cache for the given Document
"""
doc_key = get_metadata_cache_key(document.pk)
cache.set(
doc_key,
MetadataCacheData(
document.checksum,
original_metadata,
document.archive_checksum,
archive_metadata,
),
timeout,
)
def refresh_metadata_cache(
document_id: int,
*,
timeout: int = CACHE_50_MINUTES,
) -> None:
"""
Refreshes the expiration of the metadata for the given document ID
to the given timeout
"""
doc_key = get_metadata_cache_key(document_id)
cache.touch(doc_key, timeout)
def get_thumbnail_modified_key(document_id: int) -> str:
"""
Builds the key to store a thumbnail's timestamp
"""
return f"doc_{document_id}_thumbnail_modified"
def clear_document_caches(document_id: int) -> None:
"""
Removes all cached items for the given document
"""
cache.delete_many(
[
get_suggestion_cache_key(document_id),
get_metadata_cache_key(document_id),
get_thumbnail_modified_key(document_id),
],
)

src/documents/checks.py

@@ -0,0 +1,88 @@
import textwrap
from django.conf import settings
from django.core.checks import Error
from django.core.checks import Warning
from django.core.checks import register
from django.core.exceptions import FieldError
from django.db.utils import OperationalError
from django.db.utils import ProgrammingError
from documents.signals import document_consumer_declaration
from documents.templating.utils import convert_format_str_to_template_format
@register()
def changed_password_check(app_configs, **kwargs):
from documents.models import Document
from paperless.db import GnuPG
try:
encrypted_doc = (
Document.objects.filter(
storage_type=Document.STORAGE_TYPE_GPG,
)
.only("pk", "storage_type")
.first()
)
except (OperationalError, ProgrammingError, FieldError):
return [] # No documents table yet
if encrypted_doc:
if not settings.PASSPHRASE:
return [
Error(
"The database contains encrypted documents but no password is set.",
),
]
if not GnuPG.decrypted(encrypted_doc.source_file):
return [
Error(
textwrap.dedent(
"""
The current password doesn't match the password of the
existing documents.
If you intend to change your password, you must first export
all of the old documents, start fresh with the new password
and then re-import them."
""",
),
),
]
return []
@register()
def parser_check(app_configs, **kwargs):
parsers = []
for response in document_consumer_declaration.send(None):
parsers.append(response[1])
if len(parsers) == 0:
return [
Error(
"No parsers found. This is a bug. The consumer won't be "
"able to consume any documents without parsers.",
),
]
else:
return []
@register()
def filename_format_check(app_configs, **kwargs):
if settings.FILENAME_FORMAT:
converted_format = convert_format_str_to_template_format(
settings.FILENAME_FORMAT,
)
if converted_format != settings.FILENAME_FORMAT:
return [
Warning(
f"Filename format {settings.FILENAME_FORMAT} is using the old style, please update to use double curly brackets",
hint=converted_format,
),
]
return []
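
filename_format_check() only warns when the configured format still uses old single-brace placeholders, and the hint it emits is the converted value. Illustratively (the exact spacing produced by convert_format_str_to_template_format is an assumption):

# Old-style format that triggers the warning:
#   PAPERLESS_FILENAME_FORMAT = "{created_year}/{correspondent}/{title}"
#
# Converted hint using double curly brackets (assumed spacing):
#   "{{ created_year }}/{{ correspondent }}/{{ title }}"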

src/documents/classifier.py

@@ -0,0 +1,546 @@
from __future__ import annotations
import logging
import pickle
import re
import warnings
from hashlib import sha256
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Iterator
from datetime import datetime
from numpy import ndarray
from django.conf import settings
from django.core.cache import cache
from django.core.cache import caches
from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from documents.caching import StoredLRUCache
from documents.models import Document
from documents.models import MatchingModel
logger = logging.getLogger("paperless.classifier")
ADVANCED_TEXT_PROCESSING_ENABLED = (
settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED
)
read_cache = caches["read-cache"]
RE_DIGIT = re.compile(r"\d")
RE_WORD = re.compile(r"\b[\w]+\b") # words that may contain digits
class IncompatibleClassifierVersionError(Exception):
def __init__(self, message: str, *args: object) -> None:
self.message: str = message
super().__init__(*args)
class ClassifierModelCorruptError(Exception):
pass
def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
if not settings.MODEL_FILE.is_file():
logger.debug(
"Document classification model does not exist (yet), not "
"performing automatic matching.",
)
return None
classifier = DocumentClassifier()
try:
classifier.load()
except IncompatibleClassifierVersionError as e:
logger.info(f"Classifier version incompatible: {e.message}, will re-train")
Path(settings.MODEL_FILE).unlink()
classifier = None
if raise_exception:
raise e
except ClassifierModelCorruptError as e:
# there's something wrong with the model file.
logger.exception(
"Unrecoverable error while loading document "
"classification model, deleting model file.",
)
Path(settings.MODEL_FILE).unlink()
classifier = None
if raise_exception:
raise e
except OSError as e:
logger.exception("IO error while loading document classification model")
classifier = None
if raise_exception:
raise e
except Exception as e: # pragma: no cover
logger.exception("Unknown error while loading document classification model")
classifier = None
if raise_exception:
raise e
return classifier
class DocumentClassifier:
# v7 - Updated scikit-learn package version
# v8 - Added storage path classifier
# v9 - Changed from hashing to time/ids for re-train check
FORMAT_VERSION = 9
def __init__(self) -> None:
# last time a document changed and therefore training might be required
self.last_doc_change_time: datetime | None = None
# Hash of primary keys of AUTO matching values last used in training
self.last_auto_type_hash: bytes | None = None
self.data_vectorizer = None
self.data_vectorizer_hash = None
self.tags_binarizer = None
self.tags_classifier = None
self.correspondent_classifier = None
self.document_type_classifier = None
self.storage_path_classifier = None
self._stemmer = None
# 10,000 elements roughly use 200 to 500 KB per worker,
# and also in the shared Redis cache.
# Keep this cache small to minimize lookup and I/O latency.
if ADVANCED_TEXT_PROCESSING_ENABLED:
self._stem_cache = StoredLRUCache(
f"stem_cache_v{self.FORMAT_VERSION}",
capacity=10000,
)
self._stop_words = None
def _update_data_vectorizer_hash(self):
self.data_vectorizer_hash = sha256(
pickle.dumps(self.data_vectorizer),
).hexdigest()
def load(self) -> None:
from sklearn.exceptions import InconsistentVersionWarning
# Catch warnings for processing
with warnings.catch_warnings(record=True) as w:
with Path(settings.MODEL_FILE).open("rb") as f:
schema_version = pickle.load(f)
if schema_version != self.FORMAT_VERSION:
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.",
)
else:
try:
self.last_doc_change_time = pickle.load(f)
self.last_auto_type_hash = pickle.load(f)
self.data_vectorizer = pickle.load(f)
self._update_data_vectorizer_hash()
self.tags_binarizer = pickle.load(f)
self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f)
self.document_type_classifier = pickle.load(f)
self.storage_path_classifier = pickle.load(f)
except Exception as err:
raise ClassifierModelCorruptError from err
# Check for the warning about unpickling from differing versions
# and consider it incompatible
sk_learn_warning_url = (
"https://scikit-learn.org/stable/"
"model_persistence.html"
"#security-maintainability-limitations"
)
for warning in w:
# The warning is inconsistent: the MLPClassifier raises a specific warning, other estimators have not been updated yet
if issubclass(warning.category, InconsistentVersionWarning) or (
issubclass(warning.category, UserWarning)
and sk_learn_warning_url in str(warning.message)
):
raise IncompatibleClassifierVersionError("sklearn version update")
def save(self) -> None:
target_file: Path = settings.MODEL_FILE
target_file_temp: Path = target_file.with_suffix(".pickle.part")
with target_file_temp.open("wb") as f:
pickle.dump(self.FORMAT_VERSION, f)
pickle.dump(self.last_doc_change_time, f)
pickle.dump(self.last_auto_type_hash, f)
pickle.dump(self.data_vectorizer, f)
pickle.dump(self.tags_binarizer, f)
pickle.dump(self.tags_classifier, f)
pickle.dump(self.correspondent_classifier, f)
pickle.dump(self.document_type_classifier, f)
pickle.dump(self.storage_path_classifier, f)
target_file_temp.rename(target_file)
def train(self) -> bool:
# Get non-inbox documents
docs_queryset = (
Document.objects.exclude(
tags__is_inbox_tag=True,
)
.select_related("document_type", "correspondent", "storage_path")
.prefetch_related("tags")
.order_by("pk")
)
# No documents exist to train against
if docs_queryset.count() == 0:
raise ValueError("No training data available.")
labels_tags = []
labels_correspondent = []
labels_document_type = []
labels_storage_path = []
# Step 1: Extract and preprocess training data from the database.
logger.debug("Gathering data from database...")
hasher = sha256()
for doc in docs_queryset:
y = -1
dt = doc.document_type
if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
y = dt.pk
hasher.update(y.to_bytes(4, "little", signed=True))
labels_document_type.append(y)
y = -1
cor = doc.correspondent
if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
y = cor.pk
hasher.update(y.to_bytes(4, "little", signed=True))
labels_correspondent.append(y)
tags: list[int] = list(
doc.tags.filter(matching_algorithm=MatchingModel.MATCH_AUTO)
.order_by("pk")
.values_list("pk", flat=True),
)
for tag in tags:
hasher.update(tag.to_bytes(4, "little", signed=True))
labels_tags.append(tags)
y = -1
sp = doc.storage_path
if sp and sp.matching_algorithm == MatchingModel.MATCH_AUTO:
y = sp.pk
hasher.update(y.to_bytes(4, "little", signed=True))
labels_storage_path.append(y)
labels_tags_unique = {tag for tags in labels_tags for tag in tags}
num_tags = len(labels_tags_unique)
# Check if retraining is actually required:
# - a document has been updated since the classifier was trained, or
# - new auto-matching tags, types, correspondents or storage paths exist
latest_doc_change = docs_queryset.latest("modified").modified
if (
self.last_doc_change_time is not None
and self.last_doc_change_time >= latest_doc_change
) and self.last_auto_type_hash == hasher.digest():
logger.info("No updates since last training")
# Set the classifier information into the cache
# Caching for 50 minutes, so slightly less than the normal retrain time
cache.set(
CLASSIFIER_MODIFIED_KEY,
self.last_doc_change_time,
CACHE_50_MINUTES,
)
cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
return False
# Subtract 1 since -1 (null) is also part of the classes.
# The union with {-1} accounts for cases where all documents have
# correspondents and types assigned, so -1 isn't part of labels_x
# (which it usually is).
num_correspondents: int = len(set(labels_correspondent) | {-1}) - 1
num_document_types: int = len(set(labels_document_type) | {-1}) - 1
num_storage_paths: int = len(set(labels_storage_path) | {-1}) - 1
logger.debug(
f"{docs_queryset.count()} documents, {num_tags} tag(s), {num_correspondents} correspondent(s), "
f"{num_document_types} document type(s). {num_storage_paths} storage path(s)",
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
# Step 2: vectorize data
logger.debug("Vectorizing data...")
def content_generator() -> Iterator[str]:
"""
Generates the content for documents, one at a time
"""
for doc in docs_queryset:
yield self.preprocess_content(doc.content, shared_cache=False)
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1, 2),
min_df=0.01,
)
data_vectorized: ndarray = self.data_vectorizer.fit_transform(
content_generator(),
)
# See the notes here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# This attribute isn't needed to function and can be large
self.data_vectorizer.stop_words_ = None
# Step 3: train the classifiers
if num_tags > 0:
logger.debug("Training tags classifier...")
if num_tags == 1:
# Special case where only one tag has auto:
# Fallback to binary classification.
labels_tags = [
label[0] if len(label) == 1 else -1 for label in labels_tags
]
self.tags_binarizer = LabelBinarizer()
labels_tags_vectorized: ndarray = self.tags_binarizer.fit_transform(
labels_tags,
).ravel()
else:
self.tags_binarizer = MultiLabelBinarizer()
labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)
self.tags_classifier = MLPClassifier(tol=0.01)
self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
else:
self.tags_classifier = None
logger.debug("There are no tags. Not training tags classifier.")
if num_correspondents > 0:
logger.debug("Training correspondent classifier...")
self.correspondent_classifier = MLPClassifier(tol=0.01)
self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
else:
self.correspondent_classifier = None
logger.debug(
"There are no correspondents. Not training correspondent classifier.",
)
if num_document_types > 0:
logger.debug("Training document type classifier...")
self.document_type_classifier = MLPClassifier(tol=0.01)
self.document_type_classifier.fit(data_vectorized, labels_document_type)
else:
self.document_type_classifier = None
logger.debug(
"There are no document types. Not training document type classifier.",
)
if num_storage_paths > 0:
logger.debug(
"Training storage paths classifier...",
)
self.storage_path_classifier = MLPClassifier(tol=0.01)
self.storage_path_classifier.fit(
data_vectorized,
labels_storage_path,
)
else:
self.storage_path_classifier = None
logger.debug(
"There are no storage paths. Not training storage path classifier.",
)
self.last_doc_change_time = latest_doc_change
self.last_auto_type_hash = hasher.digest()
self._update_data_vectorizer_hash()
# Set the classifier information into the cache
# Caching for 50 minutes, so slightly less than the normal retrain time
cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES)
cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
return True
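# Rough sketch of how training is typically driven (the surrounding task code
# is an assumption; only train() and save() are defined here):
#
#     classifier = load_classifier() or DocumentClassifier()
#     if classifier.train():   # True only if something changed since last run
#         classifier.save()    # atomic write via the .pickle.part temp file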
def _init_advanced_text_processing(self):
if self._stop_words is None or self._stemmer is None:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Not really hacky, since it isn't private and is documented, but
# set the search path for NLTK data to the single location it should be in
nltk.data.path = [settings.NLTK_DIR]
try:
# Preload the corpus early, to force the lazy loader to transform
stopwords.ensure_loaded()
# Do some one time setup
# Sometimes, somehow, there's multiple threads loading the corpus
# and it's not thread safe, raising an AttributeError
self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
self._stop_words = frozenset(stopwords.words(settings.NLTK_LANGUAGE))
except AttributeError:
logger.debug("Could not initialize NLTK for advanced text processing.")
return False
return True
def stem_and_skip_stop_words(self, words: list[str], *, shared_cache=True):
"""
Reduce a list of words to their stem. Stop words are converted to empty strings.
:param words: the list of words to stem
"""
def _stem_and_skip_stop_word(word: str):
"""
Reduce a given word to its stem. If it's a stop word, return an empty string.
E.g. "amazement", "amaze" and "amazed" all return "amaz".
"""
cached = self._stem_cache.get(word)
if cached is not None:
return cached
elif word in self._stop_words:
return ""
# Assumption: words that contain numbers are never stemmed
elif RE_DIGIT.search(word):
return word
else:
result = self._stemmer.stem(word)
self._stem_cache.set(word, result)
return result
if shared_cache:
self._stem_cache.load()
# Stem the words and skip stop words
result = " ".join(
filter(None, (_stem_and_skip_stop_word(w) for w in words)),
)
if shared_cache:
self._stem_cache.save()
return result
def preprocess_content(
self,
content: str,
*,
shared_cache=True,
) -> str:
"""
Process the contents of a document, distilling it down into
words which are meaningful to the content.
A stemmer cache is shared across workers with the parameter "shared_cache".
This is unnecessary when training the classifier.
"""
# Lower case the document, reduce space,
# and keep only letters and digits.
content = " ".join(match.group().lower() for match in RE_WORD.finditer(content))
if ADVANCED_TEXT_PROCESSING_ENABLED:
from nltk.tokenize import word_tokenize
if not self._init_advanced_text_processing():
return content
# Tokenize
# This splits the content into tokens, roughly words
words = word_tokenize(content, language=settings.NLTK_LANGUAGE)
# Stem the words and skip stop words
content = self.stem_and_skip_stop_words(words, shared_cache=shared_cache)
return content
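# Illustrative walk-through of the preprocessing above (the exact stems are an
# assumption and depend on the configured NLTK language):
#
#     preprocess_content("The Invoice was paid on 2021-03-01!")
#     # word/digit tokens, lower-cased: "the invoice was paid on 2021 03 01"
#     # with advanced processing: stop words dropped and words stemmed,
#     # roughly "invoic paid 2021 03 01"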
def _get_vectorizer_cache_key(self, content: str):
hash = sha256(content.encode())
hash.update(
f"|{self.FORMAT_VERSION}|{settings.NLTK_LANGUAGE}|{settings.NLTK_ENABLED}|{self.data_vectorizer_hash}".encode(),
)
return f"vectorized_content_{hash.hexdigest()}"
def _vectorize(self, content: str):
key = self._get_vectorizer_cache_key(content)
serialized_result = read_cache.get(key)
if serialized_result is None:
result = self.data_vectorizer.transform([self.preprocess_content(content)])
read_cache.set(key, pickle.dumps(result), CACHE_5_MINUTES)
else:
read_cache.touch(key, CACHE_5_MINUTES)
result = pickle.loads(serialized_result)
return result
def predict_correspondent(self, content: str) -> int | None:
if self.correspondent_classifier:
X = self._vectorize(content)
correspondent_id = self.correspondent_classifier.predict(X)
if correspondent_id != -1:
return correspondent_id
else:
return None
else:
return None
def predict_document_type(self, content: str) -> int | None:
if self.document_type_classifier:
X = self._vectorize(content)
document_type_id = self.document_type_classifier.predict(X)
if document_type_id != -1:
return document_type_id
else:
return None
else:
return None
def predict_tags(self, content: str) -> list[int]:
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
X = self._vectorize(content)
y = self.tags_classifier.predict(X)
tags_ids = self.tags_binarizer.inverse_transform(y)[0]
if type_of_target(y).startswith("multilabel"):
# the usual case when there are multiple tags.
return list(tags_ids)
elif type_of_target(y) == "binary" and tags_ids != -1:
# This is for when we have binary classification with only one
# tag and the result is to assign this tag.
return [tags_ids]
else:
# Usually binary as well with -1 as the result, but we're
# going to catch everything else here as well.
return []
else:
return []
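# Note on the branches above: with several AUTO tags the MultiLabelBinarizer
# path returns a tuple of tag ids per sample, with a single AUTO tag the
# binary fallback returns one id or -1. Rough sketch (ids are illustrative):
#
#     classifier.predict_tags(doc.content)  # e.g. [3, 7]  (multilabel case)
#     classifier.predict_tags(doc.content)  # e.g. [5] or []  (binary case)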
def predict_storage_path(self, content: str) -> int | None:
if self.storage_path_classifier:
X = self._vectorize(content)
storage_path_id = self.storage_path_classifier.predict(X)
if storage_path_id != -1:
return storage_path_id
else:
return None
else:
return None

View File

@@ -0,0 +1,149 @@
from datetime import datetime
from datetime import timezone
from django.conf import settings
from django.core.cache import cache
from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from documents.caching import get_thumbnail_modified_key
from documents.classifier import DocumentClassifier
from documents.models import Document
def suggestions_etag(request, pk: int) -> str | None:
"""
Returns an optional string for the ETag, allowing browser caching of
suggestions if the classifier has not been changed and the suggested dates
setting is also unchanged
"""
# If no model file, no etag at all
if not settings.MODEL_FILE.exists():
return None
# Check cache information
cache_hits = cache.get_many(
[CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY],
)
# If the version differs somehow, no etag
if (
CLASSIFIER_VERSION_KEY in cache_hits
and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
):
return None
elif CLASSIFIER_HASH_KEY in cache_hits:
# Refresh the cache and return the hash digest and the dates setting
cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES)
return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}"
return None
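# Example of the ETag produced above (hash value is illustrative):
#
#     suggestions_etag(request, pk=42)
#     # -> "a3f09c0d...:3"  (classifier hash + NUMBER_OF_SUGGESTED_DATES)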
def suggestions_last_modified(request, pk: int) -> datetime | None:
"""
Returns the datetime of the classifier's last modification. This is slightly off,
as there is no way to track modification of the suggested dates setting, but it
seems unlikely that it changes too often
"""
# No file, no last modified
if not settings.MODEL_FILE.exists():
return None
cache_hits = cache.get_many(
[CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY],
)
# If the version differs somehow, no last modified
if (
CLASSIFIER_VERSION_KEY in cache_hits
and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
):
return None
elif CLASSIFIER_MODIFIED_KEY in cache_hits:
# Refresh the cache and return the last modified
cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES)
return cache_hits[CLASSIFIER_MODIFIED_KEY]
return None
def metadata_etag(request, pk: int) -> str | None:
"""
Metadata is extracted from the original file, so use its checksum as the
ETag
"""
try:
doc = Document.objects.only("checksum").get(pk=pk)
return doc.checksum
except Document.DoesNotExist: # pragma: no cover
return None
return None
def metadata_last_modified(request, pk: int) -> datetime | None:
"""
Metadata is extracted from the original file, so use its modified time. Strictly
speaking, this is the modification time of the database object, not of the
original file, but we might as well err on the side of caution
"""
try:
doc = Document.objects.only("modified").get(pk=pk)
return doc.modified
except Document.DoesNotExist: # pragma: no cover
return None
return None
def preview_etag(request, pk: int) -> str | None:
"""
ETag for the document preview, using the original or archive checksum, depending on the request
"""
try:
doc = Document.objects.only("checksum", "archive_checksum").get(pk=pk)
use_original = (
"original" in request.query_params
and request.query_params["original"] == "true"
)
return doc.checksum if use_original else doc.archive_checksum
except Document.DoesNotExist: # pragma: no cover
return None
return None
def preview_last_modified(request, pk: int) -> datetime | None:
"""
Uses the document's modified time to set the Last-Modified header. Not strictly
speaking correct, but close enough and quick
"""
try:
doc = Document.objects.only("modified").get(pk=pk)
return doc.modified
except Document.DoesNotExist: # pragma: no cover
return None
return None
def thumbnail_last_modified(request, pk: int) -> datetime | None:
"""
Returns the filesystem last modified either from cache or from filesystem.
Cache should be (slightly?) faster than filesystem
"""
try:
doc = Document.objects.only("storage_type").get(pk=pk)
if not doc.thumbnail_path.exists():
return None
doc_key = get_thumbnail_modified_key(pk)
cache_hit = cache.get(doc_key)
if cache_hit is not None:
cache.touch(doc_key, CACHE_50_MINUTES)
return cache_hit
# No cache, get the timestamp and cache the datetime
last_modified = datetime.fromtimestamp(
doc.thumbnail_path.stat().st_mtime,
tz=timezone.utc,
)
cache.set(doc_key, last_modified, CACHE_50_MINUTES)
return last_modified
except Document.DoesNotExist: # pragma: no cover
return None

860
src/documents/consumer.py Normal file
View File

@@ -0,0 +1,860 @@
import datetime
import hashlib
import os
import tempfile
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING
import magic
from django.conf import settings
from django.contrib.auth.models import User
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from documents.classifier import load_classifier
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.loggers import LoggingMixin
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.models import WorkflowTrigger
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date
from documents.permissions import set_permissions_for_object
from documents.plugins.base import AlwaysRunPluginMixin
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin
from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions
from documents.signals import document_consumption_finished
from documents.signals import document_consumption_started
from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless_mail.parsers import MailDocumentParser
class WorkflowTriggerPlugin(
NoCleanupPluginMixin,
NoSetupPluginMixin,
AlwaysRunPluginMixin,
ConsumeTaskPlugin,
):
NAME: str = "WorkflowTriggerPlugin"
def run(self) -> str | None:
"""
Get overrides from matching workflows
"""
overrides, msg = run_workflows(
trigger_type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
document=self.input_doc,
logging_group=None,
overrides=DocumentMetadataOverrides(),
)
if overrides:
self.metadata.update(overrides)
return msg
class ConsumerError(Exception):
pass
class ConsumerStatusShortMessage(str, Enum):
DOCUMENT_ALREADY_EXISTS = "document_already_exists"
DOCUMENT_ALREADY_EXISTS_IN_TRASH = "document_already_exists_in_trash"
ASN_ALREADY_EXISTS = "asn_already_exists"
ASN_ALREADY_EXISTS_IN_TRASH = "asn_already_exists_in_trash"
ASN_RANGE = "asn_value_out_of_range"
FILE_NOT_FOUND = "file_not_found"
PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
NEW_FILE = "new_file"
UNSUPPORTED_TYPE = "unsupported_type"
PARSING_DOCUMENT = "parsing_document"
GENERATING_THUMBNAIL = "generating_thumbnail"
PARSE_DATE = "parse_date"
SAVE_DOCUMENT = "save_document"
FINISHED = "finished"
FAILED = "failed"
class ConsumerPluginMixin:
def __init__(
self,
input_doc: ConsumableDocument,
metadata: DocumentMetadataOverrides,
status_mgr: ProgressManager,
base_tmp_dir: Path,
task_id: str,
) -> None:
super().__init__(input_doc, metadata, status_mgr, base_tmp_dir, task_id)
self.renew_logging_group()
self.filename = self.metadata.filename or self.input_doc.original_file.name
def _send_progress(
self,
current_progress: int,
max_progress: int,
status: ProgressStatusOptions,
message: ConsumerStatusShortMessage | str | None = None,
document_id=None,
): # pragma: no cover
self.status_mgr.send_progress(
status,
message,
current_progress,
max_progress,
extra_args={
"document_id": document_id,
"owner_id": self.metadata.owner_id if self.metadata.owner_id else None,
"users_can_view": (self.metadata.view_users or [])
+ (self.metadata.change_users or []),
"groups_can_view": (self.metadata.view_groups or [])
+ (self.metadata.change_groups or []),
},
)
def _fail(
self,
message: ConsumerStatusShortMessage | str,
log_message: str | None = None,
exc_info=None,
exception: Exception | None = None,
):
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
self.log.error(log_message or message, exc_info=exc_info)
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
class ConsumerPlugin(
AlwaysRunPluginMixin,
NoSetupPluginMixin,
NoCleanupPluginMixin,
LoggingMixin,
ConsumerPluginMixin,
ConsumeTaskPlugin,
):
logging_name = "paperless.consumer"
def run_pre_consume_script(self):
"""
If one is configured and exists, run the pre-consume script and
handle its output and/or errors
"""
if not settings.PRE_CONSUME_SCRIPT:
return
if not Path(settings.PRE_CONSUME_SCRIPT).is_file():
self._fail(
ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_NOT_FOUND,
f"Configured pre-consume script "
f"{settings.PRE_CONSUME_SCRIPT} does not exist.",
)
self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
working_file_path = str(self.working_copy)
original_file_path = str(self.input_doc.original_file)
script_env = os.environ.copy()
script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
script_env["DOCUMENT_WORKING_PATH"] = working_file_path
script_env["TASK_ID"] = self.task_id or ""
try:
run_subprocess(
[
settings.PRE_CONSUME_SCRIPT,
original_file_path,
],
script_env,
self.log,
)
except Exception as e:
self._fail(
ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
f"Error while executing pre-consume script: {e}",
exc_info=True,
exception=e,
)
def run_post_consume_script(self, document: Document):
"""
If one is configured and exists, run the post-consume script and
handle its output and/or errors
"""
if not settings.POST_CONSUME_SCRIPT:
return
if not Path(settings.POST_CONSUME_SCRIPT).is_file():
self._fail(
ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_NOT_FOUND,
f"Configured post-consume script "
f"{settings.POST_CONSUME_SCRIPT} does not exist.",
)
self.log.info(
f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}",
)
script_env = os.environ.copy()
script_env["DOCUMENT_ID"] = str(document.pk)
script_env["DOCUMENT_TYPE"] = str(document.document_type)
script_env["DOCUMENT_CREATED"] = str(document.created)
script_env["DOCUMENT_MODIFIED"] = str(document.modified)
script_env["DOCUMENT_ADDED"] = str(document.added)
script_env["DOCUMENT_FILE_NAME"] = document.get_public_filename()
script_env["DOCUMENT_SOURCE_PATH"] = os.path.normpath(document.source_path)
script_env["DOCUMENT_ARCHIVE_PATH"] = os.path.normpath(
str(document.archive_path),
)
script_env["DOCUMENT_THUMBNAIL_PATH"] = os.path.normpath(
document.thumbnail_path,
)
script_env["DOCUMENT_DOWNLOAD_URL"] = reverse(
"document-download",
kwargs={"pk": document.pk},
)
script_env["DOCUMENT_THUMBNAIL_URL"] = reverse(
"document-thumb",
kwargs={"pk": document.pk},
)
script_env["DOCUMENT_OWNER"] = (
document.owner.get_username() if document.owner else ""
)
script_env["DOCUMENT_CORRESPONDENT"] = str(document.correspondent)
script_env["DOCUMENT_TAGS"] = str(
",".join(document.tags.all().values_list("name", flat=True)),
)
script_env["DOCUMENT_ORIGINAL_FILENAME"] = str(document.original_filename)
script_env["TASK_ID"] = self.task_id or ""
try:
run_subprocess(
[
settings.POST_CONSUME_SCRIPT,
str(document.pk),
document.get_public_filename(),
os.path.normpath(document.source_path),
os.path.normpath(document.thumbnail_path),
reverse("document-download", kwargs={"pk": document.pk}),
reverse("document-thumb", kwargs={"pk": document.pk}),
str(document.correspondent),
str(",".join(document.tags.all().values_list("name", flat=True))),
],
script_env,
self.log,
)
except Exception as e:
self._fail(
ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
f"Error while executing post-consume script: {e}",
exc_info=True,
exception=e,
)
def run(self) -> str:
"""
Return a success message once the document was successfully created.
"""
tempdir = None
try:
# Preflight has already run including progress update to 0%
self.log.info(f"Consuming {self.filename}")
# For the actual work, copy the file into a tempdir
tempdir = tempfile.TemporaryDirectory(
prefix="paperless-ngx",
dir=settings.SCRATCH_DIR,
)
self.working_copy = Path(tempdir.name) / Path(self.filename)
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
self.unmodified_original = None
# Determine the parser class.
mime_type = magic.from_file(self.working_copy, mime=True)
self.log.debug(f"Detected mime type: {mime_type}")
if (
Path(self.filename).suffix.lower() == ".pdf"
and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
):
try:
# The file might be a pdf, but the mime type is wrong.
# Try to clean with qpdf
self.log.debug(
"Detected possible PDF with wrong mime type, trying to clean with qpdf",
)
run_subprocess(
[
"qpdf",
"--replace-input",
self.working_copy,
],
logger=self.log,
)
mime_type = magic.from_file(self.working_copy, mime=True)
self.log.debug(f"Detected mime type after qpdf: {mime_type}")
# Save the original file for later
self.unmodified_original = (
Path(tempdir.name) / Path("uo") / Path(self.filename)
)
self.unmodified_original.parent.mkdir(exist_ok=True)
copy_file_with_basic_stats(
self.input_doc.original_file,
self.unmodified_original,
)
except Exception as e:
self.log.error(f"Error attempting to clean PDF: {e}")
# Based on the mime type, get the parser for that type
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
mime_type,
)
if not parser_class:
tempdir.cleanup()
self._fail(
ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
f"Unsupported mime type {mime_type}",
)
# Notify all listeners that we're going to do some work.
document_consumption_started.send(
sender=self.__class__,
filename=self.working_copy,
logging_group=self.logging_group,
)
self.run_pre_consume_script()
except:
if tempdir:
tempdir.cleanup()
raise
def progress_callback(current_progress, max_progress): # pragma: no cover
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 50 + 20)
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
# This doesn't parse the document yet, but gives us a parser.
document_parser: DocumentParser = parser_class(
self.logging_group,
progress_callback=progress_callback,
)
self.log.debug(f"Parser: {type(document_parser).__name__}")
# Parse the document. This may take some time.
text = None
date = None
thumbnail = None
archive_path = None
page_count = None
try:
self._send_progress(
20,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.PARSING_DOCUMENT,
)
self.log.debug(f"Parsing {self.filename}...")
if (
isinstance(document_parser, MailDocumentParser)
and self.input_doc.mailrule_id
):
document_parser.parse(
self.working_copy,
mime_type,
self.filename,
self.input_doc.mailrule_id,
)
else:
document_parser.parse(self.working_copy, mime_type, self.filename)
self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress(
70,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
thumbnail = document_parser.get_thumbnail(
self.working_copy,
mime_type,
self.filename,
)
text = document_parser.get_text()
date = document_parser.get_date()
if date is None:
self._send_progress(
90,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.PARSE_DATE,
)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
page_count = document_parser.get_page_count(self.working_copy, mime_type)
except ParseError as e:
document_parser.cleanup()
if tempdir:
tempdir.cleanup()
self._fail(
str(e),
f"Error occurred while consuming document {self.filename}: {e}",
exc_info=True,
exception=e,
)
except Exception as e:
document_parser.cleanup()
if tempdir:
tempdir.cleanup()
self._fail(
str(e),
f"Unexpected error while consuming document {self.filename}: {e}",
exc_info=True,
exception=e,
)
# Prepare the document classifier.
# TODO: I don't really like to do this here, but this way we avoid
# reloading the classifier multiple times, since there are multiple
# post-consume hooks that all require the classifier.
classifier = load_classifier()
self._send_progress(
95,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.SAVE_DOCUMENT,
)
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
with transaction.atomic():
# store the document.
document = self._store(
text=text,
date=date,
page_count=page_count,
mime_type=mime_type,
)
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier,
original_file=self.unmodified_original
if self.unmodified_original
else self.working_copy,
)
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
with FileLock(settings.MEDIA_LOCK):
document.filename = generate_unique_filename(document)
create_source_path_directory(document.source_path)
self._write(
document.storage_type,
self.unmodified_original
if self.unmodified_original is not None
else self.working_copy,
document.source_path,
)
self._write(
document.storage_type,
thumbnail,
document.thumbnail_path,
)
if archive_path and Path(archive_path).is_file():
document.archive_filename = generate_unique_filename(
document,
archive_filename=True,
)
create_source_path_directory(document.archive_path)
self._write(
document.storage_type,
archive_path,
document.archive_path,
)
with Path(archive_path).open("rb") as f:
document.archive_checksum = hashlib.md5(
f.read(),
).hexdigest()
# Don't save with the lock active. Saving will cause the file
# renaming logic to acquire the lock as well.
# This triggers things like file renaming
document.save()
# Delete the file only if it was successfully consumed
self.log.debug(f"Deleting original file {self.input_doc.original_file}")
self.input_doc.original_file.unlink()
self.log.debug(f"Deleting working copy {self.working_copy}")
self.working_copy.unlink()
if self.unmodified_original is not None: # pragma: no cover
self.log.debug(
f"Deleting unmodified original file {self.unmodified_original}",
)
self.unmodified_original.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = (
Path(self.input_doc.original_file).parent
/ f"._{Path(self.input_doc.original_file).name}"
)
if Path(shadow_file).is_file():
self.log.debug(f"Deleting shadow file {shadow_file}")
Path(shadow_file).unlink()
except Exception as e:
self._fail(
str(e),
f"The following error occurred while storing document "
f"{self.filename} after parsing: {e}",
exc_info=True,
exception=e,
)
finally:
document_parser.cleanup()
tempdir.cleanup()
self.run_post_consume_script(document)
self.log.info(f"Document {document} consumption finished")
self._send_progress(
100,
100,
ProgressStatusOptions.SUCCESS,
ConsumerStatusShortMessage.FINISHED,
document.id,
)
# Return the most up to date fields
document.refresh_from_db()
return f"Success. New document id {document.pk} created"
def _parse_title_placeholders(self, title: str) -> str:
local_added = timezone.localtime(timezone.now())
correspondent_name = (
Correspondent.objects.get(pk=self.metadata.correspondent_id).name
if self.metadata.correspondent_id is not None
else None
)
doc_type_name = (
DocumentType.objects.get(pk=self.metadata.document_type_id).name
if self.metadata.document_type_id is not None
else None
)
owner_username = (
User.objects.get(pk=self.metadata.owner_id).username
if self.metadata.owner_id is not None
else None
)
return parse_w_workflow_placeholders(
title,
correspondent_name,
doc_type_name,
owner_username,
local_added,
self.filename,
self.filename,
)
def _store(
self,
text: str,
date: datetime.datetime | None,
page_count: int | None,
mime_type: str,
) -> Document:
# If someone gave us the original filename, use it instead of doc.
self.log.debug("Saving record to database")
if self.metadata.created is not None:
create_date = self.metadata.created
self.log.debug(
f"Creation date from post_documents parameter: {create_date}",
)
elif date is not None:
create_date = date
self.log.debug(f"Creation date from parse_date: {create_date}")
else:
stats = Path(self.input_doc.original_file).stat()
create_date = timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime),
)
self.log.debug(f"Creation date from st_mtime: {create_date}")
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if self.metadata.filename:
title = Path(self.metadata.filename).stem
else:
title = self.input_doc.original_file.stem
if self.metadata.title is not None:
try:
title = self._parse_title_placeholders(self.metadata.title)
except Exception as e:
self.log.error(
f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}",
)
file_for_checksum = (
self.unmodified_original
if self.unmodified_original is not None
else self.working_copy
)
document = Document.objects.create(
title=title[:127],
content=text,
mime_type=mime_type,
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
created=create_date,
modified=create_date,
storage_type=storage_type,
page_count=page_count,
original_filename=self.filename,
)
self.apply_overrides(document)
document.save()
return document
def apply_overrides(self, document):
if self.metadata.correspondent_id:
document.correspondent = Correspondent.objects.get(
pk=self.metadata.correspondent_id,
)
if self.metadata.document_type_id:
document.document_type = DocumentType.objects.get(
pk=self.metadata.document_type_id,
)
if self.metadata.tag_ids:
for tag_id in self.metadata.tag_ids:
document.add_nested_tags([Tag.objects.get(pk=tag_id)])
if self.metadata.storage_path_id:
document.storage_path = StoragePath.objects.get(
pk=self.metadata.storage_path_id,
)
if self.metadata.asn is not None:
document.archive_serial_number = self.metadata.asn
if self.metadata.owner_id:
document.owner = User.objects.get(
pk=self.metadata.owner_id,
)
if (
self.metadata.view_users is not None
or self.metadata.view_groups is not None
or self.metadata.change_users is not None
or self.metadata.change_groups is not None
):
permissions = {
"view": {
"users": self.metadata.view_users or [],
"groups": self.metadata.view_groups or [],
},
"change": {
"users": self.metadata.change_users or [],
"groups": self.metadata.change_groups or [],
},
}
set_permissions_for_object(permissions=permissions, object=document)
if self.metadata.custom_fields:
for field in CustomField.objects.filter(
id__in=self.metadata.custom_fields.keys(),
).distinct():
value_field_name = CustomFieldInstance.get_value_field_name(
data_type=field.data_type,
)
args = {
"field": field,
"document": document,
value_field_name: self.metadata.custom_fields.get(field.id, None),
}
CustomFieldInstance.objects.create(**args) # adds to document
def _write(self, storage_type, source, target):
with (
Path(source).open("rb") as read_file,
Path(target).open("wb") as write_file,
):
write_file.write(read_file.read())
# Attempt to copy file's original stats, but it's ok if we can't
try:
copy_basic_file_stats(source, target)
except Exception: # pragma: no cover
pass
class ConsumerPreflightPlugin(
NoCleanupPluginMixin,
NoSetupPluginMixin,
AlwaysRunPluginMixin,
LoggingMixin,
ConsumerPluginMixin,
ConsumeTaskPlugin,
):
NAME: str = "ConsumerPreflightPlugin"
logging_name = "paperless.consumer"
def pre_check_file_exists(self):
"""
Confirm the input file still exists where it should
"""
if TYPE_CHECKING:
assert isinstance(self.input_doc.original_file, Path), (
self.input_doc.original_file
)
if not self.input_doc.original_file.is_file():
self._fail(
ConsumerStatusShortMessage.FILE_NOT_FOUND,
f"Cannot consume {self.input_doc.original_file}: File not found.",
)
def pre_check_duplicate(self):
"""
Using the MD5 of the file, check this exact file doesn't already exist
"""
with Path(self.input_doc.original_file).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
existing_doc = Document.global_objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)
if existing_doc.exists():
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
if existing_doc.first().deleted_at is not None:
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
log_msg += " Note: existing document is in the trash."
if settings.CONSUMER_DELETE_DUPLICATES:
Path(self.input_doc.original_file).unlink()
self._fail(
msg,
log_msg,
)
def pre_check_directories(self):
"""
Ensure all required directories exist before attempting to use them
"""
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
def pre_check_asn_value(self):
"""
Check that if override_asn is given, it is unique and within a valid range
"""
if self.metadata.asn is None:
# check not necessary in case no ASN gets set
return
# Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index
if (
self.metadata.asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or self.metadata.asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail(
ConsumerStatusShortMessage.ASN_RANGE,
f"Not consuming {self.filename}: "
f"Given ASN {self.metadata.asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
)
existing_asn_doc = Document.global_objects.filter(
archive_serial_number=self.metadata.asn,
)
if existing_asn_doc.exists():
msg = ConsumerStatusShortMessage.ASN_ALREADY_EXISTS
log_msg = f"Not consuming {self.filename}: Given ASN {self.metadata.asn} already exists!"
if existing_asn_doc.first().deleted_at is not None:
msg = ConsumerStatusShortMessage.ASN_ALREADY_EXISTS_IN_TRASH
log_msg += " Note: existing document is in the trash."
self._fail(
msg,
log_msg,
)
def run(self) -> None:
self._send_progress(
0,
100,
ProgressStatusOptions.STARTED,
ConsumerStatusShortMessage.NEW_FILE,
)
# Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists()
self.pre_check_duplicate()
self.pre_check_directories()
self.pre_check_asn_value()

View File

@@ -0,0 +1,35 @@
from django.conf import settings as django_settings
from django.contrib.auth.models import User
from documents.models import Document
from paperless.config import GeneralConfig
def settings(request):
general_config = GeneralConfig()
app_title = (
django_settings.APP_TITLE
if general_config.app_title is None or len(general_config.app_title) == 0
else general_config.app_title
)
app_logo = (
django_settings.APP_LOGO
if general_config.app_logo is None or len(general_config.app_logo) == 0
else django_settings.BASE_URL + general_config.app_logo.lstrip("/")
)
return {
"EMAIL_ENABLED": django_settings.EMAIL_ENABLED,
"DISABLE_REGULAR_LOGIN": django_settings.DISABLE_REGULAR_LOGIN,
"REDIRECT_LOGIN_TO_SSO": django_settings.REDIRECT_LOGIN_TO_SSO,
"ACCOUNT_ALLOW_SIGNUPS": django_settings.ACCOUNT_ALLOW_SIGNUPS,
"domain": getattr(django_settings, "PAPERLESS_URL", request.get_host()),
"APP_TITLE": app_title,
"APP_LOGO": app_logo,
"FIRST_INSTALL": User.objects.exclude(
username__in=["consumer", "AnonymousUser"],
).count()
== 0
and Document.global_objects.count() == 0,
}

View File

@@ -0,0 +1,50 @@
from pathlib import Path
import img2pdf
from django.conf import settings
from PIL import Image
from documents.utils import copy_basic_file_stats
from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess
def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
"""
Converts a TIFF file into a PDF file.
The PDF will be created in the given target_directory and share the name of
the original TIFF file, as well as its stats (mtime etc.).
Returns the path of the PDF created.
"""
# override pixel setting if needed
maybe_override_pixel_limit()
with Image.open(tiff_path) as im:
has_alpha_layer = im.mode in ("RGBA", "LA")
if has_alpha_layer:
# Note the save into the temp folder, so as not to trigger a new
# consume
scratch_image = target_directory / tiff_path.name
run_subprocess(
[
settings.CONVERT_BINARY,
"-alpha",
"off",
tiff_path,
scratch_image,
],
)
else:
# Not modifying the original, safe to use in place
scratch_image = tiff_path
pdf_path = (target_directory / tiff_path.name).with_suffix(".pdf")
with scratch_image.open("rb") as img_file, pdf_path.open("wb") as pdf_file:
pdf_file.write(img2pdf.convert(img_file))
# Copy what file stat is possible
copy_basic_file_stats(tiff_path, pdf_path)
return pdf_path

View File

@@ -0,0 +1,175 @@
import dataclasses
import datetime
from enum import IntEnum
from pathlib import Path
import magic
from guardian.shortcuts import get_groups_with_perms
from guardian.shortcuts import get_users_with_perms
@dataclasses.dataclass
class DocumentMetadataOverrides:
"""
Manages overrides for document fields which normally would
be set from content or matching. All fields default to None,
meaning no override is happening
"""
filename: str | None = None
title: str | None = None
correspondent_id: int | None = None
document_type_id: int | None = None
tag_ids: list[int] | None = None
storage_path_id: int | None = None
created: datetime.datetime | None = None
asn: int | None = None
owner_id: int | None = None
view_users: list[int] | None = None
view_groups: list[int] | None = None
change_users: list[int] | None = None
change_groups: list[int] | None = None
custom_fields: dict | None = None
def update(self, other: "DocumentMetadataOverrides") -> "DocumentMetadataOverrides":
"""
Merges two DocumentMetadataOverrides objects such that object B's overrides
are applied to object A, or merged where multiple values are accepted.
The update is an in-place modification of self
"""
# scalar fields: override whenever the incoming value is set
if other.title is not None:
self.title = other.title
if other.correspondent_id is not None:
self.correspondent_id = other.correspondent_id
if other.document_type_id is not None:
self.document_type_id = other.document_type_id
if other.storage_path_id is not None:
self.storage_path_id = other.storage_path_id
if other.owner_id is not None:
self.owner_id = other.owner_id
# merge
if self.tag_ids is None:
self.tag_ids = other.tag_ids
elif other.tag_ids is not None:
self.tag_ids.extend(other.tag_ids)
self.tag_ids = list(set(self.tag_ids))
if self.view_users is None:
self.view_users = other.view_users
elif other.view_users is not None:
self.view_users.extend(other.view_users)
self.view_users = list(set(self.view_users))
if self.view_groups is None:
self.view_groups = other.view_groups
elif other.view_groups is not None:
self.view_groups.extend(other.view_groups)
self.view_groups = list(set(self.view_groups))
if self.change_users is None:
self.change_users = other.change_users
elif other.change_users is not None:
self.change_users.extend(other.change_users)
self.change_users = list(set(self.change_users))
if self.change_groups is None:
self.change_groups = other.change_groups
elif other.change_groups is not None:
self.change_groups.extend(other.change_groups)
self.change_groups = list(set(self.change_groups))
if self.custom_fields is None:
self.custom_fields = other.custom_fields
elif other.custom_fields is not None:
self.custom_fields.update(other.custom_fields)
return self
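# Merge semantics sketch (values are illustrative): scalar fields take the
# incoming override when it is set, list fields are unioned and de-duplicated.
#
#     a = DocumentMetadataOverrides(title="Old", tag_ids=[1, 2])
#     b = DocumentMetadataOverrides(title="New", tag_ids=[2, 3], owner_id=7)
#     a.update(b)
#     # a.title == "New", a.owner_id == 7, sorted(a.tag_ids) == [1, 2, 3]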
@staticmethod
def from_document(doc) -> "DocumentMetadataOverrides":
"""
Fills in the overrides from a document object
"""
overrides = DocumentMetadataOverrides()
overrides.title = doc.title
overrides.correspondent_id = doc.correspondent.id if doc.correspondent else None
overrides.document_type_id = doc.document_type.id if doc.document_type else None
overrides.storage_path_id = doc.storage_path.id if doc.storage_path else None
overrides.owner_id = doc.owner.id if doc.owner else None
overrides.tag_ids = list(doc.tags.values_list("id", flat=True))
overrides.view_users = list(
get_users_with_perms(
doc,
only_with_perms_in=["view_document"],
).values_list("id", flat=True),
)
overrides.change_users = list(
get_users_with_perms(
doc,
only_with_perms_in=["change_document"],
).values_list("id", flat=True),
)
overrides.custom_fields = {
custom_field.id: custom_field.value
for custom_field in doc.custom_fields.all()
}
groups_with_perms = get_groups_with_perms(
doc,
attach_perms=True,
)
overrides.view_groups = [
group.id
for group in groups_with_perms
if "view_document" in groups_with_perms[group]
]
overrides.change_groups = [
group.id
for group in groups_with_perms
if "change_document" in groups_with_perms[group]
]
return overrides
class DocumentSource(IntEnum):
"""
The source of an incoming document. May have other uses in the future
"""
ConsumeFolder = 1
ApiUpload = 2
MailFetch = 3
WebUI = 4
@dataclasses.dataclass
class ConsumableDocument:
"""
Encapsulates an incoming document, either from consume folder, API upload
or mail fetching and certain useful operations on it.
"""
source: DocumentSource
original_file: Path
original_path: Path | None = None
mailrule_id: int | None = None
mime_type: str = dataclasses.field(init=False, default=None)
def __post_init__(self):
"""
After a dataclass is initialized, this is called to finalize some data
1. Make sure the original path is an absolute, fully qualified path
2. Get the mime type of the file
"""
# Always fully qualify the path first thing
# Just in case, convert to a path if it's a str
self.original_file = Path(self.original_file).resolve()
# Get the file type once at init
# Note this function isn't called when the object is unpickled
self.mime_type = magic.from_file(self.original_file, mime=True)
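# Illustrative construction (the path is an assumption): __post_init__ resolves
# the path and fills in the mime type, so callers only supply the source and
# file location.
#
#     doc = ConsumableDocument(
#         source=DocumentSource.ConsumeFolder,
#         original_file=Path("/usr/src/paperless/consume/scan.pdf"),
#     )
#     doc.mime_type  # e.g. "application/pdf"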

View File

@@ -0,0 +1,146 @@
import datetime as dt
import logging
import os
import shutil
from pathlib import Path
from typing import Final
from django.conf import settings
from pikepdf import Pdf
from documents.consumer import ConsumerError
from documents.converters import convert_from_tiff_to_pdf
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin
from documents.plugins.base import StopConsumeTaskError
logger = logging.getLogger("paperless.double_sided")
# Hardcoded for now, could be made a configurable setting if needed
TIMEOUT_MINUTES: Final[int] = 30
TIMEOUT_SECONDS: Final[int] = TIMEOUT_MINUTES * 60
# Used by test cases
STAGING_FILE_NAME = "double-sided-staging.pdf"
class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin):
NAME: str = "CollatePlugin"
@property
def able_to_run(self) -> bool:
return (
settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED
and settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
in self.input_doc.original_file.parts
)
def run(self) -> str | None:
"""
Tries to collate pages from 2 single sided scans of a double sided
document.
When called with a file, it checks whether a staging file exists; if not,
the current file is turned into that staging file containing the odd
numbered pages.
If a staging file exists and is not too old, the current file is
considered to be the second part (the even numbered pages) and the pages
of both files will be collated; the pages of the second file are added
in reverse order, since the ADF will have scanned the pages from bottom
to top.
Returns a status message on success, or raises a ConsumerError
in case of failure.
"""
if self.input_doc.mime_type == "application/pdf":
pdf_file = self.input_doc.original_file
elif (
self.input_doc.mime_type == "image/tiff"
and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
):
pdf_file = convert_from_tiff_to_pdf(
self.input_doc.original_file,
self.base_tmp_dir,
)
self.input_doc.original_file.unlink()
else:
raise ConsumerError(
"Unsupported file type for collation of double-sided scans",
)
staging: Path = settings.SCRATCH_DIR / STAGING_FILE_NAME
valid_staging_exists = False
if staging.exists():
stats = staging.stat()
# if the file is older than the timeout, we don't consider
# it valid
if (dt.datetime.now().timestamp() - stats.st_mtime) > TIMEOUT_SECONDS:
logger.warning("Outdated double sided staging file exists, deleting it")
staging.unlink()
else:
valid_staging_exists = True
if valid_staging_exists:
try:
# Collate pages from second PDF in reverse order
with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
pdf2.pages.reverse()
try:
for i, page in enumerate(pdf2.pages):
pdf1.pages.insert(2 * i + 1, page)
except IndexError:
raise ConsumerError(
"This second file (even numbered pages) contains more "
"pages than the first/odd numbered one. This means the "
"two uploaded files don't belong to the same double-"
"sided scan. Please retry, starting with the odd "
"numbered pages again.",
)
# Merged file has the same path, but without the
# double-sided subdir. Therefore, it is also in the
# consumption dir and will be picked up for processing
old_file = self.input_doc.original_file
new_file = Path(
*(
part
for part in old_file.with_name(
f"{old_file.stem}-collated.pdf",
).parts
if part
!= settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
),
)
# If the user didn't create the subdirs yet, do it for them
new_file.parent.mkdir(parents=True, exist_ok=True)
pdf1.save(new_file)
logger.info("Collated documents into new file %s", new_file)
raise StopConsumeTaskError(
"Success. Even numbered pages of double sided scan collated "
"with odd pages",
)
finally:
# Delete staging and recently uploaded file no matter what.
# If any error occurs, the user needs to be able to restart
# the process from scratch; after all, the staging file
# with the odd numbered pages might be the culprit
pdf_file.unlink()
staging.unlink()
else:
shutil.move(pdf_file, staging)
# update access to modification time so we know if the file
# is outdated when another file gets uploaded
timestamp = dt.datetime.now().timestamp()
os.utime(staging, (timestamp, timestamp))
logger.info(
"Got scan with odd numbered pages of double-sided scan, moved it to %s",
staging,
)
raise StopConsumeTaskError(
"Received odd numbered pages of double sided scan, waiting up to "
f"{TIMEOUT_MINUTES} minutes for even numbered pages",
)
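# Worked example of the collation above (page numbers are illustrative): the
# first upload holds the odd pages [1, 3, 5]; the ADF scans the flipped stack
# bottom-up, so the second upload arrives as [6, 4, 2]. After
# pdf2.pages.reverse() it becomes [2, 4, 6], and inserting page i at index
# 2*i + 1 yields the collated order [1, 2, 3, 4, 5, 6].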

View File

@@ -0,0 +1,176 @@
import os
from pathlib import Path
from django.conf import settings
from documents.models import Document
from documents.templating.filepath import validate_filepath_template_and_render
from documents.templating.utils import convert_format_str_to_template_format
def create_source_path_directory(source_path: Path) -> None:
source_path.parent.mkdir(parents=True, exist_ok=True)
def delete_empty_directories(directory: Path, root: Path) -> None:
if not directory.is_dir():
return
if not directory.is_relative_to(root):
# don't do anything outside our originals folder.
# append os.path.sep so that we avoid these cases:
# directory = /home/originals2/test
# root = /home/originals ("/" gets appended and startswith fails)
return
# Go up in the directory hierarchy and try to delete all directories
while directory != root:
if not list(directory.iterdir()):
# it's empty
try:
directory.rmdir()
except OSError:
# whatever. empty directories aren't that bad anyway.
return
else:
# it's not empty.
return
# go one level up
directory = directory.parent
def generate_unique_filename(doc, *, archive_filename=False) -> Path:
"""
Generates a unique filename for doc in settings.ORIGINALS_DIR.
The returned filename is guaranteed to be either the current filename
of the document if unchanged, or a new filename that does not correspond
to any existing files. The function will append _01, _02, etc to the
filename before the extension to avoid conflicts.
If archive_filename is True, return a unique archive filename instead.
"""
if archive_filename:
old_filename: Path | None = (
Path(doc.archive_filename) if doc.archive_filename else None
)
root = settings.ARCHIVE_DIR
else:
old_filename = Path(doc.filename) if doc.filename else None
root = settings.ORIGINALS_DIR
# If generating archive filenames, try to make a name that is similar to
# the original filename first.
if archive_filename and doc.filename:
# Generate the full path using the same logic as generate_filename
base_generated = generate_filename(doc, archive_filename=archive_filename)
# Try to create a simple PDF version based on the original filename
# but preserve any directory structure from the template
if str(base_generated.parent) != ".":
# Has directory structure, preserve it
simple_pdf_name = base_generated.parent / (Path(doc.filename).stem + ".pdf")
else:
# No directory structure
simple_pdf_name = Path(Path(doc.filename).stem + ".pdf")
if simple_pdf_name == old_filename or not (root / simple_pdf_name).exists():
return simple_pdf_name
counter = 0
while True:
new_filename = generate_filename(
doc,
counter=counter,
archive_filename=archive_filename,
)
if new_filename == old_filename:
# still the same as before.
return new_filename
if (root / new_filename).exists():
counter += 1
else:
return new_filename
def generate_filename(
doc: Document,
*,
counter=0,
append_gpg=True,
archive_filename=False,
) -> Path:
base_path: Path | None = None
def format_filename(document: Document, template_str: str) -> str | None:
rendered_filename = validate_filepath_template_and_render(
template_str,
document,
)
if rendered_filename is None:
return None
# Apply this setting. It could become a filter in the future (or users could use |default)
if settings.FILENAME_FORMAT_REMOVE_NONE:
rendered_filename = rendered_filename.replace("/-none-/", "/")
rendered_filename = rendered_filename.replace(" -none-", "")
rendered_filename = rendered_filename.replace("-none-", "")
rendered_filename = rendered_filename.strip(os.sep)
rendered_filename = rendered_filename.replace(
"-none-",
"none",
) # backward compatibility
return rendered_filename
# Determine the source of the format string
if doc.storage_path is not None:
filename_format = doc.storage_path.path
elif settings.FILENAME_FORMAT is not None:
# Maybe convert old to new style
filename_format = convert_format_str_to_template_format(
settings.FILENAME_FORMAT,
)
else:
filename_format = None
# If we have one, render it
if filename_format is not None:
rendered_path: str | None = format_filename(doc, filename_format)
if rendered_path:
base_path = Path(rendered_path)
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if base_path:
# Split the path into directory and filename parts
directory = base_path.parent
# Use the full name (not just stem) as the base filename
base_filename = base_path.name
# Build the final filename with counter and filetype
final_filename = f"{base_filename}{counter_str}{filetype_str}"
# If we have a directory component, include it
if str(directory) != ".":
full_path = directory / final_filename
else:
full_path = Path(final_filename)
else:
# No template, use document ID
final_filename = f"{doc.pk:07}{counter_str}{filetype_str}"
full_path = Path(final_filename)
# Add GPG extension if needed
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
full_path = full_path.with_suffix(full_path.suffix + ".gpg")
return full_path
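# Illustrative rendering (template and values are assumptions): with a storage
# path template of "{{ created_year }}/{{ correspondent }}/{{ title }}",
# counter=1 and archive_filename=True this would return something like
#
#     Path("2024/ACME/Invoice_01.pdf")
#
# and generate_unique_filename() above keeps bumping the counter until no
# existing file collides.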

971
src/documents/filters.py Normal file
View File

@@ -0,0 +1,971 @@
from __future__ import annotations
import functools
import inspect
import json
import operator
from contextlib import contextmanager
from typing import TYPE_CHECKING
from django.contrib.contenttypes.models import ContentType
from django.db.models import Case
from django.db.models import CharField
from django.db.models import Count
from django.db.models import Exists
from django.db.models import IntegerField
from django.db.models import OuterRef
from django.db.models import Q
from django.db.models import Subquery
from django.db.models import Sum
from django.db.models import Value
from django.db.models import When
from django.db.models.functions import Cast
from django.utils.translation import gettext_lazy as _
from django_filters import DateFilter
from django_filters.rest_framework import BooleanFilter
from django_filters.rest_framework import Filter
from django_filters.rest_framework import FilterSet
from drf_spectacular.utils import extend_schema_field
from guardian.utils import get_group_obj_perms_model
from guardian.utils import get_user_obj_perms_model
from rest_framework import serializers
from rest_framework.filters import OrderingFilter
from rest_framework_guardian.filters import ObjectPermissionsFilter
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
if TYPE_CHECKING:
from collections.abc import Callable
CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"]
ID_KWARGS = ["in", "exact"]
INT_KWARGS = ["exact", "gt", "gte", "lt", "lte", "isnull"]
DATE_KWARGS = [
"year",
"month",
"day",
"gt",
"gte",
"lt",
"lte",
]
DATETIME_KWARGS = [
"year",
"month",
"day",
"date__gt",
"date__gte",
"gt",
"gte",
"date__lt",
"date__lte",
"lt",
"lte",
]
CUSTOM_FIELD_QUERY_MAX_DEPTH = 10
CUSTOM_FIELD_QUERY_MAX_ATOMS = 20
class CorrespondentFilterSet(FilterSet):
class Meta:
model = Correspondent
fields = {
"id": ID_KWARGS,
"name": CHAR_KWARGS,
}
class TagFilterSet(FilterSet):
class Meta:
model = Tag
fields = {
"id": ID_KWARGS,
"name": CHAR_KWARGS,
}
class DocumentTypeFilterSet(FilterSet):
class Meta:
model = DocumentType
fields = {
"id": ID_KWARGS,
"name": CHAR_KWARGS,
}
class StoragePathFilterSet(FilterSet):
class Meta:
model = StoragePath
fields = {
"id": ID_KWARGS,
"name": CHAR_KWARGS,
"path": CHAR_KWARGS,
}
class ObjectFilter(Filter):
def __init__(self, *, exclude=False, in_list=False, field_name=""):
super().__init__()
self.exclude = exclude
self.in_list = in_list
self.field_name = field_name
def filter(self, qs, value):
if not value:
return qs
try:
object_ids = [int(x) for x in value.split(",")]
except ValueError:
return qs
if self.in_list:
qs = qs.filter(**{f"{self.field_name}__id__in": object_ids}).distinct()
else:
for obj_id in object_ids:
if self.exclude:
qs = qs.exclude(**{f"{self.field_name}__id": obj_id})
else:
qs = qs.filter(**{f"{self.field_name}__id": obj_id})
return qs
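# Illustrative only: how the DocumentFilterSet parameters declared further below
# map onto ObjectFilter behaviour for the query value "1,2".
#   tags__id__all=1,2   -> qs.filter(tags__id=1).filter(tags__id=2)   (every id required)
#   tags__id__none=1,2  -> qs.exclude(tags__id=1).exclude(tags__id=2) (no id allowed)
#   tags__id__in=1,2    -> qs.filter(tags__id__in=[1, 2]).distinct()  (any id suffices)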
@extend_schema_field(serializers.BooleanField)
class InboxFilter(Filter):
def filter(self, qs, value):
if value == "true":
return qs.filter(tags__is_inbox_tag=True)
elif value == "false":
return qs.exclude(tags__is_inbox_tag=True)
else:
return qs
@extend_schema_field(serializers.CharField)
class TitleContentFilter(Filter):
def filter(self, qs, value):
if value:
return qs.filter(Q(title__icontains=value) | Q(content__icontains=value))
else:
return qs
@extend_schema_field(serializers.BooleanField)
class SharedByUser(Filter):
def filter(self, qs, value):
ctype = ContentType.objects.get_for_model(self.model)
UserObjectPermission = get_user_obj_perms_model()
GroupObjectPermission = get_group_obj_perms_model()
# see https://github.com/paperless-ngx/paperless-ngx/issues/5392, we limit subqueries
# to 1 because Postgres doesn't like returning > 1 row, but all we care about is > 0
return (
qs.filter(
owner_id=value,
)
.annotate(
num_shared_users=Count(
UserObjectPermission.objects.filter(
content_type=ctype,
object_pk=Cast(OuterRef("pk"), CharField()),
).values("user_id")[:1],
),
)
.annotate(
num_shared_groups=Count(
GroupObjectPermission.objects.filter(
content_type=ctype,
object_pk=Cast(OuterRef("pk"), CharField()),
).values("group_id")[:1],
),
)
.filter(
Q(num_shared_users__gt=0) | Q(num_shared_groups__gt=0),
)
if value is not None
else qs
)
class CustomFieldFilterSet(FilterSet):
class Meta:
model = CustomField
fields = {
"id": ID_KWARGS,
"name": CHAR_KWARGS,
}
@extend_schema_field(serializers.CharField)
class CustomFieldsFilter(Filter):
def filter(self, qs, value):
if value:
fields_with_matching_selects = CustomField.objects.filter(
extra_data__icontains=value,
)
option_ids = []
if fields_with_matching_selects.count() > 0:
for field in fields_with_matching_selects:
options = field.extra_data.get("select_options", [])
for _, option in enumerate(options):
if option.get("label").lower().find(value.lower()) != -1:
option_ids.extend([option.get("id")])
return (
qs.filter(custom_fields__field__name__icontains=value)
| qs.filter(custom_fields__value_text__icontains=value)
| qs.filter(custom_fields__value_bool__icontains=value)
| qs.filter(custom_fields__value_int__icontains=value)
| qs.filter(custom_fields__value_float__icontains=value)
| qs.filter(custom_fields__value_date__icontains=value)
| qs.filter(custom_fields__value_url__icontains=value)
| qs.filter(custom_fields__value_monetary__icontains=value)
| qs.filter(custom_fields__value_document_ids__icontains=value)
| qs.filter(custom_fields__value_select__in=option_ids)
| qs.filter(custom_fields__value_long_text__icontains=value)
)
else:
return qs
class MimeTypeFilter(Filter):
def filter(self, qs, value):
if value:
return qs.filter(mime_type__icontains=value)
else:
return qs
class SelectField(serializers.CharField):
def __init__(self, custom_field: CustomField):
self._options = custom_field.extra_data["select_options"]
super().__init__(max_length=16)
def to_internal_value(self, data):
# If the supplied value is the option label instead of the ID
try:
data = next(
option.get("id")
for option in self._options
if option.get("label") == data
)
except StopIteration:
pass
return super().to_internal_value(data)
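# Illustrative only: for a SELECT field whose extra_data contains
# {"select_options": [{"id": "abc123", "label": "Paid"}]}, passing the label
# "Paid" resolves to the option id "abc123"; an unknown label falls through to
# the normal CharField validation.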
def handle_validation_prefix(func: Callable):
"""
Catch ValidationErrors raised by the wrapped function
and add a prefix to the exception detail to track what causes the exception,
similar to nested serializers.
"""
def wrapper(*args, validation_prefix=None, **kwargs):
try:
return func(*args, **kwargs)
except serializers.ValidationError as e:
raise serializers.ValidationError({validation_prefix: e.detail})
# Update the signature to include the validation_prefix argument
old_sig = inspect.signature(func)
new_param = inspect.Parameter("validation_prefix", inspect.Parameter.KEYWORD_ONLY)
new_sig = old_sig.replace(parameters=[*old_sig.parameters.values(), new_param])
# Apply functools.wraps and manually set the new signature
functools.update_wrapper(wrapper, func)
wrapper.__signature__ = new_sig
return wrapper
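# Illustrative only: a hypothetical helper wrapped with handle_validation_prefix
# re-raises its ValidationError keyed by the prefix supplied at the call site.
@handle_validation_prefix
def _check_positive(value):
    if value < 0:
        raise serializers.ValidationError([_("Value must be positive.")])
    return value

# _check_positive(-1, validation_prefix="2") raises
# serializers.ValidationError({"2": ["Value must be positive."]}),
# mirroring how nested serializers report errors.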
class CustomFieldQueryParser:
EXPR_BY_CATEGORY = {
"basic": ["exact", "in", "isnull", "exists"],
"string": [
"icontains",
"istartswith",
"iendswith",
],
"arithmetic": [
"gt",
"gte",
"lt",
"lte",
"range",
],
"containment": ["contains"],
}
SUPPORTED_EXPR_CATEGORIES = {
CustomField.FieldDataType.STRING: ("basic", "string"),
CustomField.FieldDataType.URL: ("basic", "string"),
CustomField.FieldDataType.DATE: ("basic", "arithmetic"),
CustomField.FieldDataType.BOOL: ("basic",),
CustomField.FieldDataType.INT: ("basic", "arithmetic"),
CustomField.FieldDataType.FLOAT: ("basic", "arithmetic"),
CustomField.FieldDataType.MONETARY: ("basic", "string", "arithmetic"),
CustomField.FieldDataType.DOCUMENTLINK: ("basic", "containment"),
CustomField.FieldDataType.SELECT: ("basic",),
CustomField.FieldDataType.LONG_TEXT: ("basic", "string"),
}
DATE_COMPONENTS = [
"year",
"iso_year",
"month",
"day",
"week",
"week_day",
"iso_week_day",
"quarter",
]
def __init__(
self,
validation_prefix,
max_query_depth=10,
max_atom_count=20,
) -> None:
"""
A helper class that parses the query string into a `django.db.models.Q` for filtering
documents based on custom field values.
The syntax of the query expression is illustrated with the below pseudo code rules:
1. parse([`custom_field`, "exists", true]):
matches documents with Q(custom_fields__field=`custom_field`)
2. parse([`custom_field`, "exists", false]):
matches documents with ~Q(custom_fields__field=`custom_field`)
3. parse([`custom_field`, `op`, `value`]):
matches documents with
Q(custom_fields__field=`custom_field`, custom_fields__value_`type`__`op`= `value`)
4. parse(["AND", [`q0`, `q1`, ..., `qn`]])
-> parse(`q0`) & parse(`q1`) & ... & parse(`qn`)
5. parse(["OR", [`q0`, `q1`, ..., `qn`]])
-> parse(`q0`) | parse(`q1`) | ... | parse(`qn`)
6. parse(["NOT", `q`])
-> ~parse(`q`)
Args:
validation_prefix: Used to generate the ValidationError message.
max_query_depth: Limits the maximum nesting depth of queries.
max_atom_count: Limits the maximum number of atoms (i.e., rule 1, 2, 3) in the query.
`max_query_depth` and `max_atom_count` can be set to guard against generating arbitrarily
complex SQL queries.
"""
self._custom_fields: dict[int | str, CustomField] = {}
self._validation_prefix = validation_prefix
# Dummy ModelSerializer used to convert a Django models.Field to serializers.Field.
self._model_serializer = serializers.ModelSerializer()
# Used for sanity check
self._max_query_depth = max_query_depth
self._max_atom_count = max_atom_count
self._current_depth = 0
self._atom_count = 0
# The set of annotations that we need to apply to the queryset
self._annotations = {}
def parse(self, query: str) -> tuple[Q, dict[str, Count]]:
"""
Parses the query string into a `django.db.models.Q`
and a set of annotations to be applied to the queryset.
"""
try:
expr = json.loads(query)
except json.JSONDecodeError:
raise serializers.ValidationError(
{self._validation_prefix: [_("Value must be valid JSON.")]},
)
return (
self._parse_expr(expr, validation_prefix=self._validation_prefix),
self._annotations,
)
@handle_validation_prefix
def _parse_expr(self, expr) -> Q:
"""
Applies rule (1, 2, 3) or (4, 5, 6) based on the length of the expr.
"""
with self._track_query_depth():
if isinstance(expr, list | tuple):
if len(expr) == 2:
return self._parse_logical_expr(*expr)
elif len(expr) == 3:
return self._parse_atom(*expr)
raise serializers.ValidationError(
[_("Invalid custom field query expression")],
)
@handle_validation_prefix
def _parse_expr_list(self, exprs) -> list[Q]:
"""
Handles [`q0`, `q1`, ..., `qn`] in rule 4 & 5.
"""
if not isinstance(exprs, list | tuple) or not exprs:
raise serializers.ValidationError(
[_("Invalid expression list. Must be nonempty.")],
)
return [
self._parse_expr(expr, validation_prefix=i) for i, expr in enumerate(exprs)
]
def _parse_logical_expr(self, op, args) -> Q:
"""
Handles rule 4, 5, 6.
"""
op_lower = op.lower()
if op_lower == "not":
return ~self._parse_expr(args, validation_prefix=1)
if op_lower == "and":
op_func = operator.and_
elif op_lower == "or":
op_func = operator.or_
else:
raise serializers.ValidationError(
{"0": [_("Invalid logical operator {op!r}").format(op=op)]},
)
qs = self._parse_expr_list(args, validation_prefix="1")
return functools.reduce(op_func, qs)
def _parse_atom(self, id_or_name, op, value) -> Q:
"""
Handles rule 1, 2, 3.
"""
# Guard against queries with too many conditions.
self._atom_count += 1
if self._atom_count > self._max_atom_count:
raise serializers.ValidationError(
[_("Maximum number of query conditions exceeded.")],
)
custom_field = self._get_custom_field(id_or_name, validation_prefix="0")
op = self._validate_atom_op(custom_field, op, validation_prefix="1")
value = self._validate_atom_value(
custom_field,
op,
value,
validation_prefix="2",
)
# Needed because not all DB backends support Array __contains
if (
custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK
and op == "contains"
):
return self._parse_atom_doc_link_contains(custom_field, value)
value_field_name = CustomFieldInstance.get_value_field_name(
custom_field.data_type,
)
if (
custom_field.data_type == CustomField.FieldDataType.MONETARY
and op in self.EXPR_BY_CATEGORY["arithmetic"]
):
value_field_name = "value_monetary_amount"
has_field = Q(custom_fields__field=custom_field)
# We need to use an annotation here because different atoms
# might be referring to different instances of custom fields.
annotation_name = f"_custom_field_filter_{len(self._annotations)}"
# Our special exists operator.
if op == "exists":
annotation = Count("custom_fields", filter=has_field)
# A Document should have > 0 matches if it has this field, or 0 if it doesn't.
query_op = "gt" if value else "exact"
query = Q(**{f"{annotation_name}__{query_op}": 0})
else:
# Check if 1) custom field name matches, and 2) value satisfies condition
field_filter = has_field & Q(
**{f"custom_fields__{value_field_name}__{op}": value},
)
# Annotate how many matching custom fields each document has
annotation = Count("custom_fields", filter=field_filter)
# Filter document by count
query = Q(**{f"{annotation_name}__gt": 0})
self._annotations[annotation_name] = annotation
return query
@handle_validation_prefix
def _get_custom_field(self, id_or_name):
"""Get the CustomField instance by id or name."""
if id_or_name in self._custom_fields:
return self._custom_fields[id_or_name]
kwargs = (
{"id": id_or_name} if isinstance(id_or_name, int) else {"name": id_or_name}
)
try:
custom_field = CustomField.objects.get(**kwargs)
except CustomField.DoesNotExist:
raise serializers.ValidationError(
[_("{name!r} is not a valid custom field.").format(name=id_or_name)],
)
self._custom_fields[custom_field.id] = custom_field
self._custom_fields[custom_field.name] = custom_field
return custom_field
@staticmethod
def _split_op(full_op):
*prefix, op = str(full_op).rsplit("__", maxsplit=1)
prefix = prefix[0] if prefix else None
return prefix, op
@handle_validation_prefix
def _validate_atom_op(self, custom_field, raw_op):
"""Check if the `op` is compatible with the type of the custom field."""
prefix, op = self._split_op(raw_op)
# Check if the operator is supported for the current data_type.
supported = False
for category in self.SUPPORTED_EXPR_CATEGORIES[custom_field.data_type]:
if op in self.EXPR_BY_CATEGORY[category]:
supported = True
break
# Check prefix
if prefix is not None:
if (
prefix in self.DATE_COMPONENTS
and custom_field.data_type == CustomField.FieldDataType.DATE
):
pass # ok - e.g., "year__exact" for date field
else:
supported = False # anything else is invalid
if not supported:
raise serializers.ValidationError(
[
_("{data_type} does not support query expr {expr!r}.").format(
data_type=custom_field.data_type,
expr=raw_op,
),
],
)
return raw_op
def _get_serializer_field(self, custom_field, full_op):
"""Return a serializers.Field for value validation."""
prefix, op = self._split_op(full_op)
field = None
if op in ("isnull", "exists"):
# `isnull` takes either True or False regardless of the data_type.
field = serializers.BooleanField()
elif (
custom_field.data_type == CustomField.FieldDataType.DATE
and prefix in self.DATE_COMPONENTS
):
# DateField admits queries in the form of `year__exact`, etc. These take integers.
field = serializers.IntegerField()
elif custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
# We can be more specific here and make sure the value is a list.
field = serializers.ListField(child=serializers.IntegerField())
elif custom_field.data_type == CustomField.FieldDataType.SELECT:
# We use this custom serializer field to also accept SELECT option labels.
field = SelectField(custom_field)
elif custom_field.data_type == CustomField.FieldDataType.URL:
# For URL fields we don't need to be strict about validation (e.g., for istartswith).
field = serializers.CharField()
else:
# The general case: inferred from the corresponding field in CustomFieldInstance.
value_field_name = CustomFieldInstance.get_value_field_name(
custom_field.data_type,
)
model_field = CustomFieldInstance._meta.get_field(value_field_name)
field_name = model_field.deconstruct()[0]
field_class, field_kwargs = self._model_serializer.build_standard_field(
field_name,
model_field,
)
field = field_class(**field_kwargs)
field.allow_null = False
# Need to set allow_blank manually because of the inconsistency in CustomFieldInstance validation.
# See https://github.com/paperless-ngx/paperless-ngx/issues/7361.
if isinstance(field, serializers.CharField):
field.allow_blank = True
if op == "in":
# `in` takes a list of values.
field = serializers.ListField(child=field, allow_empty=False)
elif op == "range":
# `range` takes a list of values, i.e., [start, end].
field = serializers.ListField(
child=field,
min_length=2,
max_length=2,
)
return field
@handle_validation_prefix
def _validate_atom_value(self, custom_field, op, value):
"""Check if `value` is valid for the custom field and `op`. Returns the validated value."""
serializer_field = self._get_serializer_field(custom_field, op)
return serializer_field.run_validation(value)
def _parse_atom_doc_link_contains(self, custom_field, value) -> Q:
"""
Handles document link `contains` in a way that is supported by all DB backends.
"""
# If the value is an empty set,
# this is trivially true for any document with non-null document links.
if not value:
return Q(
custom_fields__field=custom_field,
custom_fields__value_document_ids__isnull=False,
)
# First we look up reverse links from the requested documents.
links = CustomFieldInstance.objects.filter(
document_id__in=value,
field__data_type=CustomField.FieldDataType.DOCUMENTLINK,
)
# Check if any of the requested IDs are missing.
missing_ids = set(value) - set(link.document_id for link in links)
if missing_ids:
# The result should be an empty set in this case.
return Q(id__in=[])
# Take the intersection of the reverse links - this should be what we are looking for.
document_ids_we_want = functools.reduce(
operator.and_,
(set(link.value_document_ids) for link in links),
)
return Q(id__in=document_ids_we_want)
@contextmanager
def _track_query_depth(self):
# guard against queries that are too deeply nested
self._current_depth += 1
if self._current_depth > self._max_query_depth:
raise serializers.ValidationError([_("Maximum nesting depth exceeded.")])
try:
yield
finally:
self._current_depth -= 1
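# Illustrative only (the field names are hypothetical): a query combining
# rules 3, 4 and 1 from the docstring above, as it arrives on the wire as JSON.
#
#   expr = '["AND", [["invoice_number", "icontains", "2024"], ["paid", "exists", true]]]'
#   q, annotations = CustomFieldQueryParser("custom_field_query").parse(expr)
#   Document.objects.annotate(**annotations).filter(q)
#
# Each atom becomes a Count annotation plus a filter on that count, which is
# exactly what CustomFieldQueryFilter below applies to the queryset.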
@extend_schema_field(serializers.CharField)
class CustomFieldQueryFilter(Filter):
def __init__(self, validation_prefix):
"""
A filter that filters documents based on custom field name and value.
Args:
validation_prefix: Used to generate the ValidationError message.
"""
super().__init__()
self._validation_prefix = validation_prefix
def filter(self, qs, value):
if not value:
return qs
parser = CustomFieldQueryParser(
self._validation_prefix,
max_query_depth=CUSTOM_FIELD_QUERY_MAX_DEPTH,
max_atom_count=CUSTOM_FIELD_QUERY_MAX_ATOMS,
)
q, annotations = parser.parse(value)
return qs.annotate(**annotations).filter(q)
class DocumentFilterSet(FilterSet):
is_tagged = BooleanFilter(
label="Is tagged",
field_name="tags",
lookup_expr="isnull",
exclude=True,
)
tags__id__all = ObjectFilter(field_name="tags")
tags__id__none = ObjectFilter(field_name="tags", exclude=True)
tags__id__in = ObjectFilter(field_name="tags", in_list=True)
correspondent__id__none = ObjectFilter(field_name="correspondent", exclude=True)
document_type__id__none = ObjectFilter(field_name="document_type", exclude=True)
storage_path__id__none = ObjectFilter(field_name="storage_path", exclude=True)
is_in_inbox = InboxFilter()
title_content = TitleContentFilter()
owner__id__none = ObjectFilter(field_name="owner", exclude=True)
custom_fields__icontains = CustomFieldsFilter()
custom_fields__id__all = ObjectFilter(field_name="custom_fields__field")
custom_fields__id__none = ObjectFilter(
field_name="custom_fields__field",
exclude=True,
)
custom_fields__id__in = ObjectFilter(
field_name="custom_fields__field",
in_list=True,
)
has_custom_fields = BooleanFilter(
label="Has custom field",
field_name="custom_fields",
lookup_expr="isnull",
exclude=True,
)
custom_field_query = CustomFieldQueryFilter("custom_field_query")
shared_by__id = SharedByUser()
mime_type = MimeTypeFilter()
# Backwards compatibility
created__date__gt = DateFilter(field_name="created", lookup_expr="gt")
created__date__gte = DateFilter(field_name="created", lookup_expr="gte")
created__date__lt = DateFilter(field_name="created", lookup_expr="lt")
created__date__lte = DateFilter(field_name="created", lookup_expr="lte")
class Meta:
model = Document
fields = {
"id": ID_KWARGS,
"title": CHAR_KWARGS,
"content": CHAR_KWARGS,
"archive_serial_number": INT_KWARGS,
"created": DATE_KWARGS,
"added": DATETIME_KWARGS,
"modified": DATETIME_KWARGS,
"original_filename": CHAR_KWARGS,
"checksum": CHAR_KWARGS,
"correspondent": ["isnull"],
"correspondent__id": ID_KWARGS,
"correspondent__name": CHAR_KWARGS,
"tags__id": ID_KWARGS,
"tags__name": CHAR_KWARGS,
"document_type": ["isnull"],
"document_type__id": ID_KWARGS,
"document_type__name": CHAR_KWARGS,
"storage_path": ["isnull"],
"storage_path__id": ID_KWARGS,
"storage_path__name": CHAR_KWARGS,
"owner": ["isnull"],
"owner__id": ID_KWARGS,
"custom_fields": ["icontains"],
}
class ShareLinkFilterSet(FilterSet):
class Meta:
model = ShareLink
fields = {
"created": DATETIME_KWARGS,
"expiration": DATETIME_KWARGS,
}
class PaperlessTaskFilterSet(FilterSet):
acknowledged = BooleanFilter(
label="Acknowledged",
field_name="acknowledged",
)
class Meta:
model = PaperlessTask
fields = {
"type": ["exact"],
"task_name": ["exact"],
"status": ["exact"],
}
class ObjectOwnedOrGrantedPermissionsFilter(ObjectPermissionsFilter):
"""
A filter backend that limits results to those where the requesting user
has read object level permissions, owns the objects, or objects without
an owner (for backwards compat)
"""
def filter_queryset(self, request, queryset, view):
objects_with_perms = super().filter_queryset(request, queryset, view)
objects_owned = queryset.filter(owner=request.user)
objects_unowned = queryset.filter(owner__isnull=True)
return objects_with_perms | objects_owned | objects_unowned
class ObjectOwnedPermissionsFilter(ObjectPermissionsFilter):
"""
A filter backend that limits results to those where the requesting user
owns the objects or objects without an owner (for backwards compat)
"""
def filter_queryset(self, request, queryset, view):
if request.user.is_superuser:
return queryset
objects_owned = queryset.filter(owner=request.user)
objects_unowned = queryset.filter(owner__isnull=True)
return objects_owned | objects_unowned
class DocumentsOrderingFilter(OrderingFilter):
field_name = "ordering"
prefix = "custom_field_"
def filter_queryset(self, request, queryset, view):
param = request.query_params.get("ordering")
if param and self.prefix in param:
custom_field_id = int(param.split(self.prefix)[1])
try:
field = CustomField.objects.get(pk=custom_field_id)
except CustomField.DoesNotExist:
raise serializers.ValidationError(
{self.prefix + str(custom_field_id): [_("Custom field not found")]},
)
annotation = None
match field.data_type:
case (
CustomField.FieldDataType.STRING
| CustomField.FieldDataType.LONG_TEXT
):
annotation = Subquery(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
).values("value_text")[:1],
)
case CustomField.FieldDataType.INT:
annotation = Subquery(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
).values("value_int")[:1],
)
case CustomField.FieldDataType.FLOAT:
annotation = Subquery(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
).values("value_float")[:1],
)
case CustomField.FieldDataType.DATE:
annotation = Subquery(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
).values("value_date")[:1],
)
case CustomField.FieldDataType.MONETARY:
annotation = Subquery(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
).values("value_monetary_amount")[:1],
)
case CustomField.FieldDataType.SELECT:
# Select options are a little more complicated since the value is the id of the option, not
# the label. Additionally, to support sqlite we can't use StringAgg, so we need to create a
# case statement for each option, setting the value to the index of the option in a list
# sorted by label, and then summing the results to give a single value for the annotation
select_options = sorted(
field.extra_data.get("select_options", []),
key=lambda x: x.get("label"),
)
whens = [
When(
custom_fields__field_id=custom_field_id,
custom_fields__value_select=option.get("id"),
then=Value(idx, output_field=IntegerField()),
)
for idx, option in enumerate(select_options)
]
whens.append(
When(
custom_fields__field_id=custom_field_id,
custom_fields__value_select__isnull=True,
then=Value(
len(select_options),
output_field=IntegerField(),
),
),
)
annotation = Sum(
Case(
*whens,
default=Value(0),
output_field=IntegerField(),
),
)
case CustomField.FieldDataType.DOCUMENTLINK:
annotation = Subquery(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
).values("value_document_ids")[:1],
)
case CustomField.FieldDataType.URL:
annotation = Subquery(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
).values("value_url")[:1],
)
case CustomField.FieldDataType.BOOL:
annotation = Subquery(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
).values("value_bool")[:1],
)
if not annotation:
# Only happens if a new data type is added and not handled here
raise ValueError("Invalid custom field data type")
queryset = (
queryset.annotate(
# We need to annotate the queryset with the custom field value
custom_field_value=annotation,
# We also need to annotate the queryset with a boolean for sorting whether the field exists
has_field=Exists(
CustomFieldInstance.objects.filter(
document_id=OuterRef("id"),
field_id=custom_field_id,
),
),
)
.order_by(
"-has_field",
param.replace(
self.prefix + str(custom_field_id),
"custom_field_value",
),
)
.distinct()
)
return super().filter_queryset(request, queryset, view)
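# Illustrative only (the custom field id 3 is hypothetical): ordering requests
# handled by DocumentsOrderingFilter above.
#   GET /api/documents/?ordering=custom_field_3     ascending by that field's value
#   GET /api/documents/?ordering=-custom_field_3    descending by that field's value
# In both directions, documents lacking the field sort after those that have it,
# because the queryset is ordered by "-has_field" first.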

492
src/documents/index.py Normal file
View File

@@ -0,0 +1,492 @@
from __future__ import annotations
import logging
import math
import re
from collections import Counter
from contextlib import contextmanager
from datetime import datetime
from datetime import time
from datetime import timedelta
from datetime import timezone
from shutil import rmtree
from typing import TYPE_CHECKING
from typing import Literal
from django.conf import settings
from django.utils import timezone as django_timezone
from django.utils.timezone import get_current_timezone
from django.utils.timezone import now
from guardian.shortcuts import get_users_with_perms
from whoosh import classify
from whoosh import highlight
from whoosh import query
from whoosh.fields import BOOLEAN
from whoosh.fields import DATETIME
from whoosh.fields import KEYWORD
from whoosh.fields import NUMERIC
from whoosh.fields import TEXT
from whoosh.fields import Schema
from whoosh.highlight import HtmlFormatter
from whoosh.idsets import BitSet
from whoosh.idsets import DocIdSet
from whoosh.index import FileIndex
from whoosh.index import create_in
from whoosh.index import exists_in
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.qparser.dateparse import English
from whoosh.qparser.plugins import FieldsPlugin
from whoosh.scoring import TF_IDF
from whoosh.util.times import timespan
from whoosh.writing import AsyncWriter
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import Note
from documents.models import User
if TYPE_CHECKING:
from django.db.models import QuerySet
from whoosh.reading import IndexReader
from whoosh.searching import ResultsPage
from whoosh.searching import Searcher
logger = logging.getLogger("paperless.index")
def get_schema() -> Schema:
return Schema(
id=NUMERIC(stored=True, unique=True),
title=TEXT(sortable=True),
content=TEXT(),
asn=NUMERIC(sortable=True, signed=False),
correspondent=TEXT(sortable=True),
correspondent_id=NUMERIC(),
has_correspondent=BOOLEAN(),
tag=KEYWORD(commas=True, scorable=True, lowercase=True),
tag_id=KEYWORD(commas=True, scorable=True),
has_tag=BOOLEAN(),
type=TEXT(sortable=True),
type_id=NUMERIC(),
has_type=BOOLEAN(),
created=DATETIME(sortable=True),
modified=DATETIME(sortable=True),
added=DATETIME(sortable=True),
path=TEXT(sortable=True),
path_id=NUMERIC(),
has_path=BOOLEAN(),
notes=TEXT(),
num_notes=NUMERIC(sortable=True, signed=False),
custom_fields=TEXT(),
custom_field_count=NUMERIC(sortable=True, signed=False),
has_custom_fields=BOOLEAN(),
custom_fields_id=KEYWORD(commas=True),
owner=TEXT(),
owner_id=NUMERIC(),
has_owner=BOOLEAN(),
viewer_id=KEYWORD(commas=True),
checksum=TEXT(),
page_count=NUMERIC(sortable=True),
original_filename=TEXT(sortable=True),
is_shared=BOOLEAN(),
)
def open_index(*, recreate=False) -> FileIndex:
try:
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR, schema=get_schema())
except Exception:
logger.exception("Error while opening the index, recreating.")
# create_in doesn't handle corrupted indexes very well, remove the directory entirely first
if settings.INDEX_DIR.is_dir():
rmtree(settings.INDEX_DIR)
settings.INDEX_DIR.mkdir(parents=True, exist_ok=True)
return create_in(settings.INDEX_DIR, get_schema())
@contextmanager
def open_index_writer(*, optimize=False) -> AsyncWriter:
writer = AsyncWriter(open_index())
try:
yield writer
except Exception as e:
logger.exception(str(e))
writer.cancel()
finally:
writer.commit(optimize=optimize)
@contextmanager
def open_index_searcher() -> Searcher:
searcher = open_index().searcher()
try:
yield searcher
finally:
searcher.close()
def update_document(writer: AsyncWriter, doc: Document) -> None:
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
notes = ",".join([str(c.note) for c in Note.objects.filter(document=doc)])
custom_fields = ",".join(
[str(c) for c in CustomFieldInstance.objects.filter(document=doc)],
)
custom_fields_ids = ",".join(
[str(f.field.id) for f in CustomFieldInstance.objects.filter(document=doc)],
)
asn: int | None = doc.archive_serial_number
if asn is not None and (
asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
logger.error(
f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
f"ASN is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
)
asn = 0
users_with_perms = get_users_with_perms(
doc,
only_with_perms_in=["view_document"],
)
viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
writer.update_document(
id=doc.pk,
title=doc.title,
content=doc.content,
correspondent=doc.correspondent.name if doc.correspondent else None,
correspondent_id=doc.correspondent.id if doc.correspondent else None,
has_correspondent=doc.correspondent is not None,
tag=tags if tags else None,
tag_id=tags_ids if tags_ids else None,
has_tag=len(tags) > 0,
type=doc.document_type.name if doc.document_type else None,
type_id=doc.document_type.id if doc.document_type else None,
has_type=doc.document_type is not None,
created=datetime.combine(doc.created, time.min),
added=doc.added,
asn=asn,
modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None,
path_id=doc.storage_path.id if doc.storage_path else None,
has_path=doc.storage_path is not None,
notes=notes,
num_notes=len(notes),
custom_fields=custom_fields,
custom_field_count=len(doc.custom_fields.all()),
has_custom_fields=len(custom_fields) > 0,
custom_fields_id=custom_fields_ids if custom_fields_ids else None,
owner=doc.owner.username if doc.owner else None,
owner_id=doc.owner.id if doc.owner else None,
has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None,
checksum=doc.checksum,
page_count=doc.page_count,
original_filename=doc.original_filename,
is_shared=len(viewer_ids) > 0,
)
logger.debug(f"Index updated for document {doc.pk}.")
def remove_document(writer: AsyncWriter, doc: Document) -> None:
remove_document_by_id(writer, doc.pk)
def remove_document_by_id(writer: AsyncWriter, doc_id) -> None:
writer.delete_by_term("id", doc_id)
def add_or_update_document(document: Document) -> None:
with open_index_writer() as writer:
update_document(writer, document)
def remove_document_from_index(document: Document) -> None:
with open_index_writer() as writer:
remove_document(writer, document)
class MappedDocIdSet(DocIdSet):
"""
A DocIdSet backed by a set of `Document` IDs.
Supports efficiently looking up if a whoosh docnum is in the provided `filter_queryset`.
"""
def __init__(self, filter_queryset: QuerySet, ixreader: IndexReader) -> None:
super().__init__()
document_ids = filter_queryset.order_by("id").values_list("id", flat=True)
max_id = document_ids.last() or 0
self.document_ids = BitSet(document_ids, size=max_id)
self.ixreader = ixreader
def __contains__(self, docnum) -> bool:
document_id = self.ixreader.stored_fields(docnum)["id"]
return document_id in self.document_ids
def __bool__(self) -> Literal[True]:
# searcher.search ignores a filter if it's "falsy".
# We use this hack so this DocIdSet, when used as a filter, is never ignored.
return True
class DelayedQuery:
def _get_query(self):
raise NotImplementedError # pragma: no cover
def _get_query_sortedby(self) -> tuple[None, Literal[False]] | tuple[str, bool]:
if "ordering" not in self.query_params:
return None, False
field: str = self.query_params["ordering"]
sort_fields_map: dict[str, str] = {
"created": "created",
"modified": "modified",
"added": "added",
"title": "title",
"correspondent__name": "correspondent",
"document_type__name": "type",
"archive_serial_number": "asn",
"num_notes": "num_notes",
"owner": "owner",
"page_count": "page_count",
}
if field.startswith("-"):
field = field[1:]
reverse = True
else:
reverse = False
if field not in sort_fields_map:
return None, False
else:
return sort_fields_map[field], reverse
def __init__(
self,
searcher: Searcher,
query_params,
page_size,
filter_queryset: QuerySet,
) -> None:
self.searcher = searcher
self.query_params = query_params
self.page_size = page_size
self.saved_results = dict()
self.first_score = None
self.filter_queryset = filter_queryset
self.suggested_correction = None
def __len__(self) -> int:
page = self[0:1]
return len(page)
def __getitem__(self, item):
if item.start in self.saved_results:
return self.saved_results[item.start]
q, mask, suggested_correction = self._get_query()
self.suggested_correction = suggested_correction
sortedby, reverse = self._get_query_sortedby()
page: ResultsPage = self.searcher.search_page(
q,
mask=mask,
filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
pagenum=math.floor(item.start / self.page_size) + 1,
pagelen=self.page_size,
sortedby=sortedby,
reverse=reverse,
)
page.results.fragmenter = highlight.ContextFragmenter(surround=50)
page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")
if not self.first_score and len(page.results) > 0 and sortedby is None:
self.first_score = page.results[0].score
page.results.top_n = list(
map(
lambda hit: (
(hit[0] / self.first_score) if self.first_score else None,
hit[1],
),
page.results.top_n,
),
)
self.saved_results[item.start] = page
return page
class LocalDateParser(English):
def reverse_timezone_offset(self, d):
return (d.replace(tzinfo=django_timezone.get_current_timezone())).astimezone(
timezone.utc,
)
def date_from(self, *args, **kwargs):
d = super().date_from(*args, **kwargs)
if isinstance(d, timespan):
d.start = self.reverse_timezone_offset(d.start)
d.end = self.reverse_timezone_offset(d.end)
elif isinstance(d, datetime):
d = self.reverse_timezone_offset(d)
return d
class DelayedFullTextQuery(DelayedQuery):
def _get_query(self) -> tuple:
q_str = self.query_params["query"]
q_str = rewrite_natural_date_keywords(q_str)
qp = MultifieldParser(
[
"content",
"title",
"correspondent",
"tag",
"type",
"notes",
"custom_fields",
],
self.searcher.ixreader.schema,
)
qp.add_plugin(
DateParserPlugin(
basedate=django_timezone.now(),
dateparser=LocalDateParser(),
),
)
q = qp.parse(q_str)
suggested_correction = None
try:
corrected = self.searcher.correct_query(q, q_str)
if corrected.string != q_str:
suggested_correction = corrected.string
except Exception as e:
logger.info(
"Error while correcting query %s: %s",
f"{q_str!r}",
e,
)
return q, None, suggested_correction
class DelayedMoreLikeThisQuery(DelayedQuery):
def _get_query(self) -> tuple:
more_like_doc_id = int(self.query_params["more_like_id"])
content = Document.objects.get(id=more_like_doc_id).content
docnum = self.searcher.document_number(id=more_like_doc_id)
kts = self.searcher.key_terms_from_text(
"content",
content,
numterms=20,
model=classify.Bo1Model,
normalize=False,
)
q = query.Or(
[query.Term("content", word, boost=weight) for word, weight in kts],
)
mask: set = {docnum}
return q, mask, None
def autocomplete(
ix: FileIndex,
term: str,
limit: int = 10,
user: User | None = None,
) -> list:
"""
Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
and without scoring
"""
terms = []
with ix.searcher(weighting=TF_IDF()) as s:
qp = QueryParser("content", schema=ix.schema)
# Don't let a search term that happens to match a field name be parsed as a
# field query; that would override the content field query and return bogus, non-text data
qp.remove_plugin_class(FieldsPlugin)
q = qp.parse(f"{term.lower()}*")
user_criterias: list = get_permissions_criterias(user)
results = s.search(
q,
terms=True,
filter=query.Or(user_criterias) if user_criterias is not None else None,
)
termCounts = Counter()
if results.has_matched_terms():
for hit in results:
for _, match in hit.matched_terms():
termCounts[match] += 1
terms = [t for t, _ in termCounts.most_common(limit)]
term_encoded: bytes = term.encode("UTF-8")
if term_encoded in terms:
terms.insert(0, terms.pop(terms.index(term_encoded)))
return terms
def get_permissions_criterias(user: User | None = None) -> list:
user_criterias = [query.Term("has_owner", text=False)]
if user is not None:
if user.is_superuser: # superusers see all docs
user_criterias = []
else:
user_criterias.append(query.Term("owner_id", user.id))
user_criterias.append(
query.Term("viewer_id", str(user.id)),
)
return user_criterias
def rewrite_natural_date_keywords(query_string: str) -> str:
"""
Rewrites natural date keywords (e.g. added:today or added:"yesterday") to UTC range syntax for Whoosh.
"""
tz = get_current_timezone()
local_now = now().astimezone(tz)
today = local_now.date()
yesterday = today - timedelta(days=1)
ranges = {
"today": (
datetime.combine(today, time.min, tzinfo=tz),
datetime.combine(today, time.max, tzinfo=tz),
),
"yesterday": (
datetime.combine(yesterday, time.min, tzinfo=tz),
datetime.combine(yesterday, time.max, tzinfo=tz),
),
}
pattern = r"(\b(?:added|created))\s*:\s*[\"']?(today|yesterday)[\"']?"
def repl(m):
field, keyword = m.group(1), m.group(2)
start, end = ranges[keyword]
start_str = start.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")
end_str = end.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")
return f"{field}:[{start_str} TO {end_str}]"
return re.sub(pattern, repl, query_string)
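# Illustrative only, assuming "today" is 2024-05-06 in a UTC+02:00 local timezone:
#   rewrite_natural_date_keywords('added:today invoice')
#   -> 'added:[20240505220000 TO 20240506215959] invoice'
# i.e. the local calendar day expressed as a UTC range in Whoosh's syntax.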

15
src/documents/loggers.py Normal file
View File

@@ -0,0 +1,15 @@
import logging
import uuid
class LoggingMixin:
def renew_logging_group(self):
"""
Creates a new UUID so that subsequent log calls can be grouped together
via the "group" entry in the logger's extra data
"""
self.logging_group = uuid.uuid4()
self.log = logging.LoggerAdapter(
logging.getLogger(self.logging_name),
extra={"group": self.logging_group},
)
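# Illustrative only: a hypothetical worker mixing in LoggingMixin. Every log
# call made after renew_logging_group() carries the same "group" UUID in its
# extra data, so related lines can be correlated.
class ExampleWorker(LoggingMixin):
    logging_name = "paperless.example"

    def run(self):
        self.renew_logging_group()
        self.log.info("starting")
        self.log.info("finished")  # same group UUID as the line above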

38
src/documents/mail.py Normal file
View File

@@ -0,0 +1,38 @@
from email import message_from_bytes
from pathlib import Path
from django.conf import settings
from django.core.mail import EmailMessage
from filelock import FileLock
def send_email(
subject: str,
body: str,
to: list[str],
attachment: Path | None = None,
attachment_mime_type: str | None = None,
) -> int:
"""
Send an email with an optional attachment.
TODO: re-evaluate this pending https://code.djangoproject.com/ticket/35581 / https://github.com/django/django/pull/18966
"""
email = EmailMessage(
subject=subject,
body=body,
to=to,
)
if attachment:
# The file could be renamed concurrently, which would prevent it from being attached
with FileLock(settings.MEDIA_LOCK), attachment.open("rb") as f:
content = f.read()
if attachment_mime_type == "message/rfc822":
# See https://forum.djangoproject.com/t/using-emailmessage-with-an-attached-email-file-crashes-due-to-non-ascii/37981
# Parse the bytes already read; the file handle is exhausted at this point
content = message_from_bytes(content)
email.attach(
filename=attachment.name,
content=content,
mimetype=attachment_mime_type,
)
return email.send()
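# Illustrative only; the recipient and attachment path are hypothetical.
send_email(
    subject="Your document",
    body="Please find the requested file attached.",
    to=["user@example.com"],
    attachment=Path("/usr/src/paperless/media/documents/originals/0000123.pdf"),
    attachment_mime_type="application/pdf",
)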

View File

@@ -0,0 +1,36 @@
from django.core.management.base import BaseCommand
from django.db import connection
from django.db import models
from documents.models import Document
class Command(BaseCommand):
# This code is taken almost entirely from https://github.com/wagtail/wagtail/pull/11912 with all credit to the original author.
help = "Converts UUID columns from char type to the native UUID type used in MariaDB 10.7+ and Django 5.0+."
def convert_field(self, model, field_name, *, null=False):
if model._meta.get_field(field_name).model != model: # pragma: no cover
# Field is inherited from a parent model
return
if not model._meta.managed: # pragma: no cover
# The migration framework skips unmanaged models, so we should too
return
old_field = models.CharField(null=null, max_length=36)
old_field.set_attributes_from_name(field_name)
new_field = models.UUIDField(null=null)
new_field.set_attributes_from_name(field_name)
with connection.schema_editor() as schema_editor:
schema_editor.alter_field(model, old_field, new_field)
self.stdout.write(
self.style.SUCCESS(
f"Successfully converted {model._meta.label} {field_name} field to UUID type.",
),
)
def handle(self, **options):
self.convert_field(Document, "transaction_id", null=True)

View File

@@ -0,0 +1,93 @@
from pathlib import Path
from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from documents.models import Document
from paperless.db import GnuPG
class Command(BaseCommand):
help = (
"This is how you migrate your stored documents from an encrypted "
"state to an unencrypted one (or vice-versa)"
)
def add_arguments(self, parser) -> None:
parser.add_argument(
"--passphrase",
help=(
"If PAPERLESS_PASSPHRASE isn't set already, you need to specify it here"
),
)
def handle(self, *args, **options) -> None:
try:
self.stdout.write(
self.style.WARNING(
"\n\n"
"WARNING: This script is going to work directly on your "
"document originals, so\n"
"WARNING: you probably shouldn't run "
"this unless you've got a recent backup\n"
"WARNING: handy. It "
"*should* work without a hitch, but be safe and backup your\n"
"WARNING: stuff first.\n\n"
"Hit Ctrl+C to exit now, or Enter to "
"continue.\n\n",
),
)
_ = input()
except KeyboardInterrupt:
return
passphrase = options["passphrase"] or settings.PASSPHRASE
if not passphrase:
raise CommandError(
"Passphrase not defined. Please set it with --passphrase or "
"by declaring it in your environment or your config.",
)
self.__gpg_to_unencrypted(passphrase)
def __gpg_to_unencrypted(self, passphrase: str) -> None:
encrypted_files = Document.objects.filter(
storage_type=Document.STORAGE_TYPE_GPG,
)
for document in encrypted_files:
self.stdout.write(f"Decrypting {document}")
old_paths = [document.source_path, document.thumbnail_path]
with document.source_file as file_handle:
raw_document = GnuPG.decrypted(file_handle, passphrase)
with document.thumbnail_file as file_handle:
raw_thumb = GnuPG.decrypted(file_handle, passphrase)
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
ext: str = Path(document.filename).suffix
if not ext == ".gpg":
raise CommandError(
f"Abort: encrypted file {document.source_path} does not "
f"end with .gpg",
)
document.filename = Path(document.filename).stem
with document.source_path.open("wb") as f:
f.write(raw_document)
with document.thumbnail_path.open("wb") as f:
f.write(raw_thumb)
Document.objects.filter(id=document.id).update(
storage_type=document.storage_type,
filename=document.filename,
)
for path in old_paths:
path.unlink()

View File

@@ -0,0 +1,94 @@
import logging
import multiprocessing
import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.tasks import update_document_content_maybe_archive_file
logger = logging.getLogger("paperless.management.archiver")
class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = (
"Regenerates the archived version of documents and updates their stored "
"content, optionally recreating the archived version for documents that "
"already have one."
)
def add_arguments(self, parser):
parser.add_argument(
"-f",
"--overwrite",
default=False,
action="store_true",
help=(
"Recreates the archived document for documents that already "
"have an archived version."
),
)
parser.add_argument(
"-d",
"--document",
default=None,
type=int,
required=False,
help=(
"Specify the ID of a document, and this command will only "
"run on this specific document."
),
)
self.add_argument_progress_bar_mixin(parser)
self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
overwrite = options["overwrite"]
if options["document"]:
documents = Document.objects.filter(pk=options["document"])
else:
documents = Document.objects.all()
document_ids = list(
map(
lambda doc: doc.id,
filter(lambda d: overwrite or not d.has_archive_version, documents),
),
)
# Note to future self: this prevents django from reusing database
# connections between processes, which is bad and does not work
# with postgres.
db.connections.close_all()
try:
logging.getLogger().handlers[0].level = logging.ERROR
if self.process_count == 1:
for doc_id in document_ids:
update_document_content_maybe_archive_file(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(
update_document_content_maybe_archive_file,
document_ids,
),
total=len(document_ids),
disable=self.no_progress_bar,
),
)
except KeyboardInterrupt:
self.stdout.write(self.style.NOTICE("Aborting..."))

View File

@@ -0,0 +1,365 @@
import logging
import os
from concurrent.futures import ThreadPoolExecutor
from fnmatch import filter
from pathlib import Path
from pathlib import PurePath
from threading import Event
from time import monotonic
from time import sleep
from typing import Final
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from watchdog.events import FileSystemEventHandler
from watchdog.observers.polling import PollingObserver
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Tag
from documents.parsers import is_file_ext_supported
from documents.tasks import consume_file
try:
from inotifyrecursive import INotify
from inotifyrecursive import flags
except ImportError: # pragma: no cover
INotify = flags = None
logger = logging.getLogger("paperless.management.consumer")
def _tags_from_path(filepath: Path) -> list[int]:
"""
Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create a Tag for every directory component.
Returns a list of the Tag IDs
"""
db.close_old_connections()
tag_ids = set()
path_parts = filepath.relative_to(settings.CONSUMPTION_DIR).parent.parts
for part in path_parts:
tag_ids.add(
Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
)
return list(tag_ids)
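# Illustrative only, assuming CONSUMPTION_DIR=/consume: a file dropped at
# /consume/Taxes/2024/scan.pdf gets (or creates) the tags "Taxes" and "2024",
# and their primary keys are returned, e.g. [4, 7].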
def _is_ignored(filepath: Path) -> bool:
"""
Checks if the given file should be ignored, based on configured
patterns.
Returns True if the file is ignored, False otherwise
"""
# Trim out the consume directory, leaving only filename and its
# path relative to the consume directory
filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
# March through the components of the path, including directories and the filename
# looking for anything matching
# foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
parts = []
for part in filepath_relative.parts:
# If the part is not the name (ie, it's a dir)
# Need to append the trailing slash or fnmatch doesn't match
# fnmatch("dir", "dir/*") == False
# fnmatch("dir/", "dir/*") == True
if part != filepath_relative.name:
part = part + "/"
parts.append(part)
for pattern in settings.CONSUMER_IGNORE_PATTERNS:
if len(filter(parts, pattern)):
return True
return False
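# Illustrative only: why the trailing slash is appended to directory parts above.
from fnmatch import fnmatch
assert not fnmatch("dir", "dir/*")   # a bare directory name does not match
assert fnmatch("dir/", "dir/*")      # with the trailing slash it does
# So CONSUMER_IGNORE_PATTERNS containing "dir/*" ignores /consume/dir/file.pdf.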
def _consume(filepath: Path) -> None:
# Check permissions early
try:
filepath.stat()
except (PermissionError, OSError):
logger.warning(f"Not consuming file {filepath}: Permission denied.")
return
if filepath.is_dir() or _is_ignored(filepath):
return
if not filepath.is_file():
logger.debug(f"Not consuming file {filepath}: File has moved.")
return
if not is_file_ext_supported(filepath.suffix):
logger.warning(f"Not consuming file {filepath}: Unknown file extension.")
return
# Total wait time: up to 500ms
os_error_retry_count: Final[int] = 50
os_error_retry_wait: Final[float] = 0.01
read_try_count = 0
file_open_ok = False
os_error_str = None
while (read_try_count < os_error_retry_count) and not file_open_ok:
try:
with filepath.open("rb"):
file_open_ok = True
except OSError as e:
read_try_count += 1
os_error_str = str(e)
sleep(os_error_retry_wait)
if read_try_count >= os_error_retry_count:
logger.warning(f"Not consuming file {filepath}: OS reports {os_error_str}")
return
tag_ids = None
try:
if settings.CONSUMER_SUBDIRS_AS_TAGS:
tag_ids = _tags_from_path(filepath)
except Exception:
logger.exception("Error creating tags from path")
try:
logger.info(f"Adding {filepath} to the task queue.")
consume_file.delay(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=filepath,
),
DocumentMetadataOverrides(tag_ids=tag_ids),
)
except Exception:
# Catch all so that the consumer won't crash.
# This is also what the test case is listening for to check for
# errors.
logger.exception("Error while consuming document")
def _consume_wait_unmodified(file: Path) -> None:
"""
Waits for the given file to appear unmodified based on file size
and modification time. Will wait a configured number of seconds
and retry a configured number of times before either consuming or
giving up
"""
if _is_ignored(file):
return
logger.debug(f"Waiting for file {file} to remain unmodified")
mtime = -1
size = -1
current_try = 0
while current_try < settings.CONSUMER_POLLING_RETRY_COUNT:
try:
stat_data = file.stat()
new_mtime = stat_data.st_mtime
new_size = stat_data.st_size
except FileNotFoundError:
logger.debug(
f"File {file} moved while waiting for it to remain unmodified.",
)
return
if new_mtime == mtime and new_size == size:
_consume(file)
return
mtime = new_mtime
size = new_size
sleep(settings.CONSUMER_POLLING_DELAY)
current_try += 1
logger.error(f"Timeout while waiting on file {file} to remain unmodified.")
class Handler(FileSystemEventHandler):
def __init__(self, pool: ThreadPoolExecutor) -> None:
super().__init__()
self._pool = pool
def on_created(self, event):
self._pool.submit(_consume_wait_unmodified, Path(event.src_path))
def on_moved(self, event):
self._pool.submit(_consume_wait_unmodified, Path(event.dest_path))
class Command(BaseCommand):
"""
On every iteration of an infinite loop, consume what we can from the
consumption directory.
"""
# This is here primarily for the tests and is irrelevant in production.
stop_flag = Event()
# Also only for testing, configures in one place the timeout used before checking
# the stop flag
testing_timeout_s: Final[float] = 0.5
testing_timeout_ms: Final[float] = testing_timeout_s * 1000.0
def add_arguments(self, parser):
parser.add_argument(
"directory",
default=settings.CONSUMPTION_DIR,
nargs="?",
help="The consumption directory.",
)
parser.add_argument("--oneshot", action="store_true", help="Run only once.")
# Only used during unit testing; configures a timeout so the stop flag is
# checked periodically. If left unset or False, the consumer will only exit
# when it receives SIGINT.
parser.add_argument(
"--testing",
action="store_true",
help="Flag used only for unit testing",
default=False,
)
def handle(self, *args, **options):
directory = options["directory"]
recursive = settings.CONSUMER_RECURSIVE
if not directory:
raise CommandError("CONSUMPTION_DIR does not appear to be set.")
directory = Path(directory).resolve()
if not directory.is_dir():
raise CommandError(f"Consumption directory {directory} does not exist")
# Consumer will need this
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
if recursive:
for dirpath, _, filenames in os.walk(directory):
for filename in filenames:
filepath = Path(dirpath) / filename
_consume(filepath)
else:
for filepath in directory.iterdir():
_consume(filepath)
if options["oneshot"]:
return
if settings.CONSUMER_POLLING == 0 and INotify:
self.handle_inotify(directory, recursive, is_testing=options["testing"])
else:
if INotify is None and settings.CONSUMER_POLLING == 0: # pragma: no cover
logger.warning("Using polling as INotify import failed")
self.handle_polling(directory, recursive, is_testing=options["testing"])
logger.debug("Consumer exiting.")
def handle_polling(self, directory, recursive, *, is_testing: bool):
logger.info(f"Polling directory for changes: {directory}")
timeout = None
if is_testing:
timeout = self.testing_timeout_s
logger.debug(f"Configuring timeout to {timeout}s")
polling_interval = settings.CONSUMER_POLLING
if polling_interval == 0: # pragma: no cover
# Only happens if INotify failed to import
logger.warning("Using polling of 10s, consider setting this")
polling_interval = 10
with ThreadPoolExecutor(max_workers=4) as pool:
observer = PollingObserver(timeout=polling_interval)
observer.schedule(Handler(pool), directory, recursive=recursive)
observer.start()
try:
while observer.is_alive():
observer.join(timeout)
if self.stop_flag.is_set():
observer.stop()
except KeyboardInterrupt:
observer.stop()
observer.join()
def handle_inotify(self, directory, recursive, *, is_testing: bool):
logger.info(f"Using inotify to watch directory for changes: {directory}")
timeout_ms = None
if is_testing:
timeout_ms = self.testing_timeout_ms
logger.debug(f"Configuring timeout to {timeout_ms}ms")
inotify = INotify()
inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO | flags.MODIFY
if recursive:
inotify.add_watch_recursive(directory, inotify_flags)
else:
inotify.add_watch(directory, inotify_flags)
inotify_debounce_secs: Final[float] = settings.CONSUMER_INOTIFY_DELAY
inotify_debounce_ms: Final[int] = inotify_debounce_secs * 1000
finished = False
notified_files = {}
try:
while not finished:
try:
for event in inotify.read(timeout=timeout_ms):
path = inotify.get_path(event.wd) if recursive else directory
filepath = Path(path) / event.name
if flags.MODIFY in flags.from_mask(event.mask):
notified_files.pop(filepath, None)
else:
notified_files[filepath] = monotonic()
# Check the files against the timeout
still_waiting = {}
# last_event_time is time of the last inotify event for this file
for filepath, last_event_time in notified_files.items():
# Current time - last time over the configured timeout
waited_long_enough = (
monotonic() - last_event_time
) > inotify_debounce_secs
# Also make sure the file exists still, some scanners might write a
# temporary file first
try:
file_still_exists = filepath.exists() and filepath.is_file()
except (PermissionError, OSError): # pragma: no cover
# If we can't check, assume the file still exists and let _consume fail if needed
file_still_exists = True
if waited_long_enough and file_still_exists:
_consume(filepath)
elif file_still_exists:
still_waiting[filepath] = last_event_time
# These files are still waiting to hit the timeout
notified_files = still_waiting
# If files are waiting, need to exit read() to check them
# Otherwise, go back to infinite sleep time, but only if not testing
if len(notified_files) > 0:
timeout_ms = inotify_debounce_ms
elif is_testing:
timeout_ms = self.testing_timeout_ms
else:
timeout_ms = None
if self.stop_flag.is_set():
logger.debug("Finishing because event is set")
finished = True
except KeyboardInterrupt:
logger.info("Received SIGINT, stopping inotify")
finished = True
finally:
inotify.close()

View File

@@ -0,0 +1,13 @@
from django.core.management.base import BaseCommand
from documents.tasks import train_classifier
class Command(BaseCommand):
help = (
"Trains the classifier on your data and saves the resulting models to a "
"file. The document consumer will then automatically use this new model."
)
def handle(self, *args, **options):
train_classifier(scheduled=False)

View File

@@ -0,0 +1,613 @@
import hashlib
import json
import os
import shutil
import tempfile
import time
from pathlib import Path
from typing import TYPE_CHECKING
import tqdm
from allauth.mfa.models import Authenticator
from allauth.socialaccount.models import SocialAccount
from allauth.socialaccount.models import SocialApp
from allauth.socialaccount.models import SocialToken
from django.conf import settings
from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core import serializers
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.db import transaction
from django.utils import timezone
from filelock import FileLock
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
if TYPE_CHECKING:
from django.db.models import QuerySet
if settings.AUDIT_LOG_ENABLED:
from auditlog.models import LogEntry
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_filename
from documents.management.commands.mixins import CryptMixin
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import SavedView
from documents.models import SavedViewFilterRule
from documents.models import StoragePath
from documents.models import Tag
from documents.models import UiSettings
from documents.models import Workflow
from documents.models import WorkflowAction
from documents.models import WorkflowActionEmail
from documents.models import WorkflowActionWebhook
from documents.models import WorkflowTrigger
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.db import GnuPG
from paperless.models import ApplicationConfiguration
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
class Command(CryptMixin, BaseCommand):
help = (
"Decrypt and rename all files in our collection into a given target "
"directory. And include a manifest file containing document data for "
"easy import."
)
def add_arguments(self, parser):
parser.add_argument("target")
parser.add_argument(
"-c",
"--compare-checksums",
default=False,
action="store_true",
help=(
"Compare file checksums when determining whether to export "
"a file or not. If not specified, file size and time "
"modified is used instead."
),
)
parser.add_argument(
"-cj",
"--compare-json",
default=False,
action="store_true",
help=(
"Compare json file checksums when determining whether to "
"export a json file or not (manifest or metadata). "
"If not specified, the file is always exported."
),
)
parser.add_argument(
"-d",
"--delete",
default=False,
action="store_true",
help=(
"After exporting, delete files in the export directory that "
"do not belong to the current export, such as files from "
"deleted documents."
),
)
parser.add_argument(
"-f",
"--use-filename-format",
default=False,
action="store_true",
help=(
"Use PAPERLESS_FILENAME_FORMAT for storing files in the "
"export directory, if configured."
),
)
parser.add_argument(
"-na",
"--no-archive",
default=False,
action="store_true",
help="Avoid exporting archive files",
)
parser.add_argument(
"-nt",
"--no-thumbnail",
default=False,
action="store_true",
help="Avoid exporting thumbnail files",
)
parser.add_argument(
"-p",
"--use-folder-prefix",
default=False,
action="store_true",
help=(
"Export files in dedicated folders according to their nature: "
"archive, originals or thumbnails"
),
)
parser.add_argument(
"-sm",
"--split-manifest",
default=False,
action="store_true",
help="Export document information in individual manifest json files.",
)
parser.add_argument(
"-z",
"--zip",
default=False,
action="store_true",
help="Export the documents to a zip file in the given directory",
)
parser.add_argument(
"-zn",
"--zip-name",
default=f"export-{timezone.localdate().isoformat()}",
help="Sets the export zip file name",
)
parser.add_argument(
"--data-only",
default=False,
action="store_true",
help="If set, only the database will be imported, not files",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
parser.add_argument(
"--passphrase",
help="If provided, is used to encrypt sensitive data in the export",
)
def handle(self, *args, **options):
self.target = Path(options["target"]).resolve()
self.split_manifest: bool = options["split_manifest"]
self.compare_checksums: bool = options["compare_checksums"]
self.compare_json: bool = options["compare_json"]
self.use_filename_format: bool = options["use_filename_format"]
self.use_folder_prefix: bool = options["use_folder_prefix"]
self.delete: bool = options["delete"]
self.no_archive: bool = options["no_archive"]
self.no_thumbnail: bool = options["no_thumbnail"]
self.zip_export: bool = options["zip"]
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
self.passphrase: str | None = options.get("passphrase")
self.files_in_export_dir: set[Path] = set()
self.exported_files: set[str] = set()
# If zipping, save the original target for later and
# get a temporary directory for the target instead
temp_dir = None
self.original_target = self.target
if self.zip_export:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
temp_dir = tempfile.TemporaryDirectory(
dir=settings.SCRATCH_DIR,
prefix="paperless-export",
)
self.target = Path(temp_dir.name).resolve()
if not self.target.exists():
raise CommandError("That path doesn't exist")
if not self.target.is_dir():
raise CommandError("That path isn't a directory")
if not os.access(self.target, os.W_OK):
raise CommandError("That path doesn't appear to be writable")
try:
# Prevent any ongoing changes in the documents
with FileLock(settings.MEDIA_LOCK):
self.dump()
# We've written everything to the temporary directory in this case,
# now make an archive in the original target, with all files stored
if self.zip_export and temp_dir is not None:
shutil.make_archive(
self.original_target / options["zip_name"],
format="zip",
root_dir=temp_dir.name,
)
finally:
# Always cleanup the temporary directory, if one was created
if self.zip_export and temp_dir is not None:
temp_dir.cleanup()
def dump(self):
# 1. Take a snapshot of what files exist in the current export folder
for x in self.target.glob("**/*"):
if x.is_file():
self.files_in_export_dir.add(x.resolve())
# 2. Create manifest, containing all correspondents, types, tags, storage paths
# note, documents and ui_settings
manifest_key_to_object_query: dict[str, QuerySet] = {
"correspondents": Correspondent.objects.all(),
"tags": Tag.objects.all(),
"document_types": DocumentType.objects.all(),
"storage_paths": StoragePath.objects.all(),
"mail_accounts": MailAccount.objects.all(),
"mail_rules": MailRule.objects.all(),
"saved_views": SavedView.objects.all(),
"saved_view_filter_rules": SavedViewFilterRule.objects.all(),
"groups": Group.objects.all(),
"users": User.objects.exclude(
username__in=["consumer", "AnonymousUser"],
).all(),
"ui_settings": UiSettings.objects.all(),
"content_types": ContentType.objects.all(),
"permissions": Permission.objects.all(),
"user_object_permissions": UserObjectPermission.objects.all(),
"group_object_permissions": GroupObjectPermission.objects.all(),
"workflow_triggers": WorkflowTrigger.objects.all(),
"workflow_actions": WorkflowAction.objects.all(),
"workflow_email_actions": WorkflowActionEmail.objects.all(),
"workflow_webhook_actions": WorkflowActionWebhook.objects.all(),
"workflows": Workflow.objects.all(),
"custom_fields": CustomField.objects.all(),
"custom_field_instances": CustomFieldInstance.objects.all(),
"app_configs": ApplicationConfiguration.objects.all(),
"notes": Note.objects.all(),
"documents": Document.objects.order_by("id").all(),
"social_accounts": SocialAccount.objects.all(),
"social_apps": SocialApp.objects.all(),
"social_tokens": SocialToken.objects.all(),
"authenticators": Authenticator.objects.all(),
}
if settings.AUDIT_LOG_ENABLED:
manifest_key_to_object_query["log_entries"] = LogEntry.objects.all()
with transaction.atomic():
manifest_dict = {}
# Build an overall manifest
for key, object_query in manifest_key_to_object_query.items():
manifest_dict[key] = json.loads(
serializers.serialize("json", object_query),
)
self.encrypt_secret_fields(manifest_dict)
# These are treated specially and included in the per-document manifest
# if that setting is enabled. Otherwise, they are just exported to the bulk
# manifest
document_map: dict[int, Document] = {
d.pk: d for d in manifest_key_to_object_query["documents"]
}
document_manifest = manifest_dict["documents"]
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
disable=self.no_progress_bar,
):
# 3.1. store files unencrypted
document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED
document = document_map[document_dict["pk"]]
# 3.2. generate a unique filename
base_name = self.generate_base_name(document)
# 3.3. write filenames into manifest
original_target, thumbnail_target, archive_target = (
self.generate_document_targets(document, base_name, document_dict)
)
# 3.4. write files to target folder
if not self.data_only:
self.copy_document_files(
document,
original_target,
thumbnail_target,
archive_target,
)
if self.split_manifest:
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
if self.use_folder_prefix:
manifest_name = Path("json") / manifest_name
manifest_name = (self.target / manifest_name).resolve()
manifest_name.parent.mkdir(parents=True, exist_ok=True)
content = [document_manifest[index]]
content += list(
filter(
lambda d: d["fields"]["document"] == document_dict["pk"],
manifest_dict["notes"],
),
)
content += list(
filter(
lambda d: d["fields"]["document"] == document_dict["pk"],
manifest_dict["custom_field_instances"],
),
)
self.check_and_write_json(
content,
manifest_name,
)
# These were exported already
if self.split_manifest:
del manifest_dict["documents"]
del manifest_dict["notes"]
del manifest_dict["custom_field_instances"]
# 4.1 write primary manifest to target folder
manifest = []
for key, item in manifest_dict.items():
manifest.extend(item)
manifest_path = (self.target / "manifest.json").resolve()
self.check_and_write_json(
manifest,
manifest_path,
)
# 4.2 write version information to target folder
extra_metadata_path = (self.target / "metadata.json").resolve()
metadata: dict[str, str | int | dict[str, str | int]] = {
"version": version.__full_version_str__,
}
# 4.2.1 If needed, write the crypto values into the metadata
# Django stores most of these in the field itself, we store them once here
if self.passphrase:
metadata.update(self.get_crypt_params())
self.check_and_write_json(
metadata,
extra_metadata_path,
)
if self.delete:
# 5. Remove files which we did not explicitly export in this run
if not self.zip_export:
for f in self.files_in_export_dir:
f.unlink()
delete_empty_directories(
f.parent,
self.target,
)
else:
# 5. Remove anything in the original location (before moving the zip)
for item in self.original_target.glob("*"):
if item.is_dir():
shutil.rmtree(item)
else:
item.unlink()
def generate_base_name(self, document: Document) -> Path:
"""
Generates a unique name for the document, one which hasn't already been exported (or will be)
"""
filename_counter = 0
while True:
if self.use_filename_format:
base_name = generate_filename(
document,
counter=filename_counter,
append_gpg=False,
)
else:
base_name = document.get_public_filename(counter=filename_counter)
if base_name not in self.exported_files:
self.exported_files.add(base_name)
break
else:
filename_counter += 1
return Path(base_name)
def generate_document_targets(
self,
document: Document,
base_name: Path,
document_dict: dict,
) -> tuple[Path, Path | None, Path | None]:
"""
Generates the targets for a given document, including the original file, archive file and thumbnail (depending on settings).
"""
original_name = base_name
if self.use_folder_prefix:
original_name = Path("originals") / original_name
original_target = (self.target / original_name).resolve()
document_dict[EXPORTER_FILE_NAME] = str(original_name)
if not self.no_thumbnail:
thumbnail_name = base_name.parent / (base_name.stem + "-thumbnail.webp")
if self.use_folder_prefix:
thumbnail_name = Path("thumbnails") / thumbnail_name
thumbnail_target = (self.target / thumbnail_name).resolve()
document_dict[EXPORTER_THUMBNAIL_NAME] = str(thumbnail_name)
else:
thumbnail_target = None
if not self.no_archive and document.has_archive_version:
archive_name = base_name.parent / (base_name.stem + "-archive.pdf")
if self.use_folder_prefix:
archive_name = Path("archive") / archive_name
archive_target = (self.target / archive_name).resolve()
document_dict[EXPORTER_ARCHIVE_NAME] = str(archive_name)
else:
archive_target = None
return original_target, thumbnail_target, archive_target
def copy_document_files(
self,
document: Document,
original_target: Path,
thumbnail_target: Path | None,
archive_target: Path | None,
) -> None:
"""
Copies files from the document storage location to the specified target location.
If the document is encrypted, the files are decrypted before copying them to the target location.
"""
if document.storage_type == Document.STORAGE_TYPE_GPG:
t = int(time.mktime(document.created.timetuple()))
original_target.parent.mkdir(parents=True, exist_ok=True)
with document.source_file as out_file:
original_target.write_bytes(GnuPG.decrypted(out_file))
os.utime(original_target, times=(t, t))
if thumbnail_target:
thumbnail_target.parent.mkdir(parents=True, exist_ok=True)
with document.thumbnail_file as out_file:
thumbnail_target.write_bytes(GnuPG.decrypted(out_file))
os.utime(thumbnail_target, times=(t, t))
if archive_target:
archive_target.parent.mkdir(parents=True, exist_ok=True)
if TYPE_CHECKING:
assert isinstance(document.archive_path, Path)
with document.archive_path as out_file:
archive_target.write_bytes(GnuPG.decrypted(out_file))
os.utime(archive_target, times=(t, t))
else:
self.check_and_copy(
document.source_path,
document.checksum,
original_target,
)
if thumbnail_target:
self.check_and_copy(document.thumbnail_path, None, thumbnail_target)
if archive_target:
if TYPE_CHECKING:
assert isinstance(document.archive_path, Path)
self.check_and_copy(
document.archive_path,
document.archive_checksum,
archive_target,
)
def check_and_write_json(
self,
content: list[dict] | dict,
target: Path,
):
"""
Writes the source content to the target json file.
If --compare-json arg was used, don't write to target file if
the file exists and checksum is identical to content checksum.
This preserves the file timestamps when no changes are made.
"""
target = target.resolve()
perform_write = True
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)
if self.compare_json:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
src_str = json.dumps(content, indent=2, ensure_ascii=False)
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
if src_checksum == target_checksum:
perform_write = False
if perform_write:
target.write_text(
json.dumps(content, indent=2, ensure_ascii=False),
encoding="utf-8",
)
def check_and_copy(
self,
source: Path,
source_checksum: str | None,
target: Path,
):
"""
Copies the source to the target, if target doesn't exist or the target doesn't seem to match
the source attributes
"""
target = target.resolve()
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)
perform_copy = False
if target.exists():
source_stat = source.stat()
target_stat = target.stat()
if self.compare_checksums and source_checksum:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
perform_copy = target_checksum != source_checksum
elif (
source_stat.st_mtime != target_stat.st_mtime
or source_stat.st_size != target_stat.st_size
):
perform_copy = True
else:
# Copy if it does not exist
perform_copy = True
if perform_copy:
target.parent.mkdir(parents=True, exist_ok=True)
copy_file_with_basic_stats(source, target)
def encrypt_secret_fields(self, manifest: dict) -> None:
"""
Encrypts certain fields in the export. Currently limited to the mail account password
"""
if self.passphrase:
self.setup_crypto(passphrase=self.passphrase)
for crypt_config in self.CRYPT_FIELDS:
exporter_key = crypt_config["exporter_key"]
crypt_fields = crypt_config["fields"]
for manifest_record in manifest[exporter_key]:
for field in crypt_fields:
if manifest_record["fields"][field]:
manifest_record["fields"][field] = self.encrypt_string(
value=manifest_record["fields"][field],
)
elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0:
self.stdout.write(
self.style.NOTICE(
"No passphrase was given, sensitive fields will be in plaintext",
),
)
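
The flags defined in add_arguments above map directly onto call_command keyword arguments. A small usage sketch, assuming this command is registered under the conventional name document_exporter (the file name is not shown in this hunk):

from django.core.management import call_command

# Incremental export into a directory, skipping unchanged files by checksum
call_command("document_exporter", "/mnt/backup/paperless", compare_checksums=True, delete=True)

# One-shot encrypted zip export; mail/social secrets are encrypted with the passphrase
call_command(
    "document_exporter",
    "/mnt/backup",
    zip=True,
    zip_name="weekly",
    passphrase="a long secret",
)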

View File

@@ -0,0 +1,149 @@
import dataclasses
import multiprocessing
from typing import Final
import rapidfuzz
import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
@dataclasses.dataclass(frozen=True)
class _WorkPackage:
first_doc: Document
second_doc: Document
@dataclasses.dataclass(frozen=True)
class _WorkResult:
doc_one_pk: int
doc_two_pk: int
ratio: float
def __lt__(self, other: "_WorkResult") -> bool:
return self.doc_one_pk < other.doc_one_pk
def _process_and_match(work: _WorkPackage) -> _WorkResult:
"""
Does basic processing of document content, gets the basic ratio
and returns the result package
"""
# Normalize the string some, lower case, whitespace, etc
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
# Basic matching ratio
match = rapidfuzz.fuzz.ratio(first_string, second_string)
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = "Searches for documents where the content almost matches"
def add_arguments(self, parser):
parser.add_argument(
"--ratio",
default=85.0,
type=float,
help="Ratio to consider documents a match",
)
parser.add_argument(
"--delete",
default=False,
action="store_true",
help="If set, one document of matches above the ratio WILL BE DELETED",
)
self.add_argument_progress_bar_mixin(parser)
self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
RATIO_MIN: Final[float] = 0.0
RATIO_MAX: Final[float] = 100.0
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
if options["delete"]:
self.stdout.write(
self.style.WARNING(
"The command is configured to delete documents. Use with caution",
),
)
opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = []
# Ratio is a float from 0.0 to 100.0
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100")
all_docs = Document.objects.all().order_by("id")
# Build work packages for processing
for first_doc in all_docs:
for second_doc in all_docs:
# doc to doc is obviously not useful
if first_doc.pk == second_doc.pk:
continue
# Skip empty documents (e.g. password-protected)
if first_doc.content.strip() == "" or second_doc.content.strip() == "":
continue
# Skip matching which have already been matched together
# doc 1 to doc 2 is the same as doc 2 to doc 1
doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
continue
checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
# Actually something useful to work on now
work_pkgs.append(_WorkPackage(first_doc, second_doc))
# Don't spin up a pool of 1 process
if self.process_count == 1:
results = []
for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
results.append(_process_and_match(work))
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
results = list(
tqdm.tqdm(
pool.imap_unordered(_process_and_match, work_pkgs),
total=len(work_pkgs),
disable=self.no_progress_bar,
),
)
# Check results
messages = []
maybe_delete_ids = []
for result in sorted(results):
if result.ratio >= opt_ratio:
messages.append(
self.style.NOTICE(
f"Document {result.doc_one_pk} fuzzy match"
f" to {result.doc_two_pk} (confidence {result.ratio:.3f})\n",
),
)
maybe_delete_ids.append(result.doc_two_pk)
if len(messages) == 0:
messages.append(
self.style.SUCCESS("No matches found\n"),
)
self.stdout.writelines(
messages,
)
if options["delete"]:
self.stdout.write(
self.style.NOTICE(
f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
),
)
Document.objects.filter(pk__in=maybe_delete_ids).delete()
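
The per-pair work in _process_and_match reduces to rapidfuzz normalisation plus a ratio score; a tiny standalone illustration:

import rapidfuzz

a = "Invoice No. 4711 - Total: 19,99 EUR"
b = "invoice no 4711 total 19,99 eur"

# default_process lowercases, strips non-alphanumeric characters and trims whitespace
a_norm = rapidfuzz.utils.default_process(a)
b_norm = rapidfuzz.utils.default_process(b)

score = rapidfuzz.fuzz.ratio(a_norm, b_norm)  # 0.0 .. 100.0
print(f"{score:.1f}")  # a high score marks the pair as a likely near-duplicate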

View File

@@ -0,0 +1,448 @@
import json
import logging
import os
import tempfile
from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
from zipfile import ZipFile
from zipfile import is_zipfile
import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import FieldDoesNotExist
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.base import DeserializationError
from django.db import IntegrityError
from django.db import transaction
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from filelock import FileLock
from documents.file_handling import create_source_path_directory
from documents.management.commands.mixins import CryptMixin
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import Tag
from documents.parsers import run_convert
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_CRYPTO_SETTINGS_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.signals.handlers import check_paths_and_prune_custom_fields
from documents.signals.handlers import update_filename_and_move_files
from documents.utils import copy_file_with_basic_stats
from paperless import version
if settings.AUDIT_LOG_ENABLED:
from auditlog.registry import auditlog
@contextmanager
def disable_signal(sig, receiver, sender) -> Generator:
try:
sig.disconnect(receiver=receiver, sender=sender)
yield
finally:
sig.connect(receiver=receiver, sender=sender)
class Command(CryptMixin, BaseCommand):
help = (
"Using a manifest.json file, load the data from there, and import the "
"documents it refers to."
)
def add_arguments(self, parser) -> None:
parser.add_argument("source")
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
parser.add_argument(
"--data-only",
default=False,
action="store_true",
help="If set, only the database will be exported, not files",
)
parser.add_argument(
"--passphrase",
help="If provided, is used to sensitive fields in the export",
)
def pre_check(self) -> None:
"""
Runs some initial checks against the state of the install and source, including:
- Does the target exist?
- Can we access the target?
- Does the target have a manifest file?
- Are there existing files in the document folders?
- Are there existing users or documents in the database?
"""
def pre_check_maybe_not_empty() -> None:
# Skip this check if operating only on the database
# We can expect data to exist in that case
if not self.data_only:
for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]:
if document_dir.exists() and document_dir.is_dir():
for entry in document_dir.glob("**/*"):
if entry.is_dir():
continue
self.stdout.write(
self.style.WARNING(
f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty installation",
),
)
break
# But existing users or other data still matters in a data only
if (
User.objects.exclude(username__in=["consumer", "AnonymousUser"]).count()
!= 0
):
self.stdout.write(
self.style.WARNING(
"Found existing user(s), this might indicate a non-empty installation",
),
)
if Document.objects.count() != 0:
self.stdout.write(
self.style.WARNING(
"Found existing documents(s), this might indicate a non-empty installation",
),
)
def pre_check_manifest_exists() -> None:
if not (self.source / "manifest.json").exists():
raise CommandError(
"That directory doesn't appear to contain a manifest.json file.",
)
if not self.source.exists():
raise CommandError("That path doesn't exist")
if not os.access(self.source, os.R_OK):
raise CommandError("That path doesn't appear to be readable")
pre_check_maybe_not_empty()
pre_check_manifest_exists()
def load_manifest_files(self) -> None:
"""
Loads manifest data from the various JSON files for parsing and loading the database
"""
main_manifest_path: Path = self.source / "manifest.json"
with main_manifest_path.open() as infile:
self.manifest = json.load(infile)
self.manifest_paths.append(main_manifest_path)
for file in Path(self.source).glob("**/*-manifest.json"):
with file.open() as infile:
self.manifest += json.load(infile)
self.manifest_paths.append(file)
def load_metadata(self) -> None:
"""
Loads either just the version information or the version information and extra data
Must account for the old style of export as well, with just version.json
"""
version_path: Path = self.source / "version.json"
metadata_path: Path = self.source / "metadata.json"
if not version_path.exists() and not metadata_path.exists():
self.stdout.write(
self.style.NOTICE("No version.json or metadata.json file located"),
)
return
if metadata_path.exists():
with metadata_path.open() as infile:
data = json.load(infile)
self.version = data["version"]
if not self.passphrase and EXPORTER_CRYPTO_SETTINGS_NAME in data:
raise CommandError(
"No passphrase was given, but this export contains encrypted fields",
)
elif EXPORTER_CRYPTO_SETTINGS_NAME in data:
self.load_crypt_params(data)
elif version_path.exists():
with version_path.open() as infile:
self.version = json.load(infile)["version"]
if self.version and self.version != version.__full_version_str__:
self.stdout.write(
self.style.WARNING(
"Version mismatch: "
f"Currently {version.__full_version_str__},"
f" importing {self.version}."
" Continuing, but import may fail.",
),
)
def load_data_to_database(self) -> None:
"""
As the name implies, loads data from the JSON file(s) into the database
"""
try:
with transaction.atomic():
# delete these since pk can change, re-created from import
ContentType.objects.all().delete()
Permission.objects.all().delete()
for manifest_path in self.manifest_paths:
call_command("loaddata", manifest_path)
except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
self.stdout.write(self.style.ERROR("Database import failed"))
if (
self.version is not None
and self.version != version.__full_version_str__
): # pragma: no cover
self.stdout.write(
self.style.ERROR(
"Version mismatch: "
f"Currently {version.__full_version_str__},"
f" importing {self.version}",
),
)
raise e
else:
self.stdout.write(
self.style.ERROR("No version information present"),
)
raise e
def handle(self, *args, **options) -> None:
logging.getLogger().handlers[0].level = logging.ERROR
self.source = Path(options["source"]).resolve()
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
self.passphrase: str | None = options.get("passphrase")
self.version: str | None = None
self.salt: str | None = None
self.manifest_paths = []
self.manifest = []
        # Create a temporary directory to extract a zip file into; also used when the supplied source is not a zip, to keep the code path uniform.
with tempfile.TemporaryDirectory() as tmp_dir:
if is_zipfile(self.source):
with ZipFile(self.source) as zf:
zf.extractall(tmp_dir)
self.source = Path(tmp_dir)
self._run_import()
def _run_import(self):
self.pre_check()
self.load_metadata()
self.load_manifest_files()
self.check_manifest_validity()
self.decrypt_secret_fields()
# see /src/documents/signals/handlers.py
with (
disable_signal(
post_save,
receiver=update_filename_and_move_files,
sender=Document,
),
disable_signal(
m2m_changed,
receiver=update_filename_and_move_files,
sender=Document.tags.through,
),
disable_signal(
post_save,
receiver=update_filename_and_move_files,
sender=CustomFieldInstance,
),
disable_signal(
post_save,
receiver=check_paths_and_prune_custom_fields,
sender=CustomField,
),
):
if settings.AUDIT_LOG_ENABLED:
auditlog.unregister(Document)
auditlog.unregister(Correspondent)
auditlog.unregister(Tag)
auditlog.unregister(DocumentType)
auditlog.unregister(Note)
auditlog.unregister(CustomField)
auditlog.unregister(CustomFieldInstance)
# Fill up the database with whatever is in the manifest
self.load_data_to_database()
if not self.data_only:
self._import_files_from_manifest()
else:
self.stdout.write(self.style.NOTICE("Data only import completed"))
self.stdout.write("Updating search index...")
call_command(
"document_index",
"reindex",
no_progress_bar=self.no_progress_bar,
)
def check_manifest_validity(self) -> None:
"""
Attempts to verify the manifest is valid. Namely checking the files
referred to exist and the files can be read from
"""
def check_document_validity(document_record: dict) -> None:
if EXPORTER_FILE_NAME not in document_record:
raise CommandError(
"The manifest file contains a record which does not "
"refer to an actual document file.",
)
doc_file = document_record[EXPORTER_FILE_NAME]
doc_path: Path = self.source / doc_file
if not doc_path.exists():
raise CommandError(
f'The manifest file refers to "{doc_file}" which does not '
"appear to be in the source directory.",
)
try:
with doc_path.open(mode="rb"):
pass
except Exception as e:
raise CommandError(
f"Failed to read from original file {doc_path}",
) from e
if EXPORTER_ARCHIVE_NAME in document_record:
archive_file = document_record[EXPORTER_ARCHIVE_NAME]
doc_archive_path: Path = self.source / archive_file
if not doc_archive_path.exists():
raise CommandError(
f"The manifest file refers to {archive_file} which "
f"does not appear to be in the source directory.",
)
try:
with doc_archive_path.open(mode="rb"):
pass
except Exception as e:
raise CommandError(
f"Failed to read from archive file {doc_archive_path}",
) from e
self.stdout.write("Checking the manifest")
for record in self.manifest:
# Only check if the document files exist if this is not data only
# We don't care about documents for a data only import
if not self.data_only and record["model"] == "documents.document":
check_document_validity(record)
def _import_files_from_manifest(self) -> None:
settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
self.stdout.write("Copy files into paperless...")
manifest_documents = list(
filter(lambda r: r["model"] == "documents.document", self.manifest),
)
for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
document = Document.objects.get(pk=record["pk"])
doc_file = record[EXPORTER_FILE_NAME]
document_path = self.source / doc_file
if EXPORTER_THUMBNAIL_NAME in record:
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
thumbnail_path = (self.source / thumb_file).resolve()
else:
thumbnail_path = None
if EXPORTER_ARCHIVE_NAME in record:
archive_file = record[EXPORTER_ARCHIVE_NAME]
archive_path = self.source / archive_file
else:
archive_path = None
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
with FileLock(settings.MEDIA_LOCK):
if Path(document.source_path).is_file():
raise FileExistsError(document.source_path)
create_source_path_directory(document.source_path)
copy_file_with_basic_stats(document_path, document.source_path)
if thumbnail_path:
if thumbnail_path.suffix in {".png", ".PNG"}:
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{thumbnail_path}[0]",
output_file=str(document.thumbnail_path),
)
else:
copy_file_with_basic_stats(
thumbnail_path,
document.thumbnail_path,
)
if archive_path:
create_source_path_directory(document.archive_path)
# TODO: this assumes that the export is valid and
# archive_filename is present on all documents with
# archived files
copy_file_with_basic_stats(archive_path, document.archive_path)
document.save()
def decrypt_secret_fields(self) -> None:
"""
The converse decryption of some fields out of the export before importing to database
"""
if self.passphrase:
# Salt has been loaded from metadata.json at this point, so it cannot be None
self.setup_crypto(passphrase=self.passphrase, salt=self.salt)
had_at_least_one_record = False
for crypt_config in self.CRYPT_FIELDS:
importer_model: str = crypt_config["model_name"]
                crypt_fields: list[str] = crypt_config["fields"]
for record in filter(
lambda x: x["model"] == importer_model,
self.manifest,
):
had_at_least_one_record = True
for field in crypt_fields:
if record["fields"][field]:
record["fields"][field] = self.decrypt_string(
value=record["fields"][field],
)
if had_at_least_one_record:
# It's annoying, but the DB is loaded from the JSON directly
# Maybe could change that in the future?
(self.source / "manifest.json").write_text(
json.dumps(self.manifest, indent=2, ensure_ascii=False),
)
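
Mirroring the exporter, the importer accepts either an export directory or a zip file as its source. A usage sketch, again assuming the conventional command name document_importer:

from django.core.management import call_command

# Restore from a previous export; data_only=True would skip copying document files
call_command(
    "document_importer",
    "/mnt/backup/paperless",
    passphrase="a long secret",
    no_progress_bar=True,
)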

View File

@@ -0,0 +1,22 @@
from django.core.management import BaseCommand
from django.db import transaction
from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import index_optimize
from documents.tasks import index_reindex
class Command(ProgressBarMixin, BaseCommand):
help = "Manages the document index."
def add_arguments(self, parser):
parser.add_argument("command", choices=["reindex", "optimize"])
self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
with transaction.atomic():
if options["command"] == "reindex":
index_reindex(progress_bar_disable=self.no_progress_bar)
elif options["command"] == "optimize":
index_optimize()

View File

@@ -0,0 +1,25 @@
import logging
import tqdm
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
class Command(ProgressBarMixin, BaseCommand):
help = "This will rename all documents to match the latest filename format."
def add_arguments(self, parser):
self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(
Document.objects.all(),
disable=self.no_progress_bar,
):
post_save.send(Document, instance=document, created=False)

View File

@@ -0,0 +1,136 @@
import logging
import tqdm
from django.core.management.base import BaseCommand
from documents.classifier import load_classifier
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_document_type
from documents.signals.handlers import set_storage_path
from documents.signals.handlers import set_tags
logger = logging.getLogger("paperless.management.retagger")
class Command(ProgressBarMixin, BaseCommand):
help = (
"Using the current classification model, assigns correspondents, tags "
"and document types to all documents, effectively allowing you to "
"back-tag all previously indexed documents with metadata created (or "
"modified) after their initial import."
)
def add_arguments(self, parser):
parser.add_argument("-c", "--correspondent", default=False, action="store_true")
parser.add_argument("-T", "--tags", default=False, action="store_true")
parser.add_argument("-t", "--document_type", default=False, action="store_true")
parser.add_argument("-s", "--storage_path", default=False, action="store_true")
parser.add_argument("-i", "--inbox-only", default=False, action="store_true")
parser.add_argument(
"--use-first",
default=False,
action="store_true",
help=(
"By default this command won't try to assign a correspondent "
"if more than one matches the document. Use this flag if "
"you'd rather it just pick the first one it finds."
),
)
parser.add_argument(
"-f",
"--overwrite",
default=False,
action="store_true",
help=(
"If set, the document retagger will overwrite any previously "
"set correspondent, document and remove correspondents, types "
"and tags that do not match anymore due to changed rules."
),
)
self.add_argument_progress_bar_mixin(parser)
parser.add_argument(
"--suggest",
default=False,
action="store_true",
help="Return the suggestion, don't change anything.",
)
parser.add_argument(
"--base-url",
help="The base URL to use to build the link to the documents.",
)
parser.add_argument(
"--id-range",
help="A range of document ids on which the retagging should be applied.",
nargs=2,
type=int,
)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
if options["inbox_only"]:
queryset = Document.objects.filter(tags__is_inbox_tag=True)
else:
queryset = Document.objects.all()
if options["id_range"]:
queryset = queryset.filter(
id__range=(options["id_range"][0], options["id_range"][1]),
)
documents = queryset.distinct()
classifier = load_classifier()
for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
if options["correspondent"]:
set_correspondent(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["document_type"]:
set_document_type(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["tags"]:
set_tags(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["storage_path"]:
set_storage_path(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
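
A quick way to exercise the retagger options above from code (command name assumed to be document_retagger):

from django.core.management import call_command

# Re-run tag and correspondent matching on inbox documents, overwriting stale assignments
call_command(
    "document_retagger",
    tags=True,
    correspondent=True,
    overwrite=True,
    inbox_only=True,
)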

View File

@@ -0,0 +1,17 @@
from django.core.management.base import BaseCommand
from documents.management.commands.mixins import ProgressBarMixin
from documents.sanity_checker import check_sanity
class Command(ProgressBarMixin, BaseCommand):
help = "This command checks your document archive for issues."
def add_arguments(self, parser):
self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
messages = check_sanity(progress=self.use_progress_bar, scheduled=False)
messages.log_messages()

View File

@@ -0,0 +1,84 @@
import logging
import multiprocessing
import shutil
import tqdm
from django import db
from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.parsers import get_parser_class_for_mime_type
def _process_document(doc_id):
document: Document = Document.objects.get(id=doc_id)
parser_class = get_parser_class_for_mime_type(document.mime_type)
if parser_class:
parser = parser_class(logging_group=None)
else:
print(f"{document} No parser for mime type {document.mime_type}") # noqa: T201
return
try:
thumb = parser.get_thumbnail(
document.source_path,
document.mime_type,
document.get_public_filename(),
)
shutil.move(thumb, document.thumbnail_path)
finally:
parser.cleanup()
class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = "This will regenerate the thumbnails for all documents."
def add_arguments(self, parser):
parser.add_argument(
"-d",
"--document",
default=None,
type=int,
required=False,
help=(
"Specify the ID of a document, and this command will only "
"run on this specific document."
),
)
self.add_argument_progress_bar_mixin(parser)
self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
logging.getLogger().handlers[0].level = logging.ERROR
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
if options["document"]:
documents = Document.objects.filter(pk=options["document"])
else:
documents = Document.objects.all()
ids = [doc.id for doc in documents]
# Note to future self: this prevents django from reusing database
# connections between processes, which is bad and does not work
# with postgres.
db.connections.close_all()
if self.process_count == 1:
for doc_id in ids:
_process_document(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=self.no_progress_bar,
),
)

View File

@@ -0,0 +1,22 @@
import sys
from django.core.management.commands.loaddata import Command as LoadDataCommand
# This class is used to migrate data between databases
# That's difficult to test
class Command(LoadDataCommand): # pragma: no cover
"""
Allow the loading of data from standard in. Sourced originally from:
https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed)
"""
def parse_name(self, fixture_name):
self.compression_formats["stdin"] = (lambda x, y: sys.stdin, None)
if fixture_name == "-":
return "-", "json", "stdin"
def find_fixtures(self, fixture_label):
if fixture_label == "-":
return [("-", None, "-")]
return super().find_fixtures(fixture_label)

View File

@@ -0,0 +1,66 @@
import logging
import os
from argparse import RawTextHelpFormatter
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
logger = logging.getLogger("paperless.management.superuser")
class Command(BaseCommand):
help = (
"Creates a Django superuser:\n"
" User named: admin\n"
" Email: root@localhost\n"
" Password: based on env variable PAPERLESS_ADMIN_PASSWORD\n"
"No superuser will be created, when:\n"
" - The username is taken already exists\n"
" - A superuser already exists\n"
" - PAPERLESS_ADMIN_PASSWORD is not set"
)
def create_parser(self, *args, **kwargs):
parser = super().create_parser(*args, **kwargs)
parser.formatter_class = RawTextHelpFormatter
return parser
def handle(self, *args, **options):
username = os.getenv("PAPERLESS_ADMIN_USER", "admin")
mail = os.getenv("PAPERLESS_ADMIN_MAIL", "root@localhost")
password = os.getenv("PAPERLESS_ADMIN_PASSWORD")
# Check if there's already a user called admin
if User.objects.filter(username=username).exists():
self.stdout.write(
self.style.NOTICE(
f"Did not create superuser, a user {username} already exists",
),
)
return
        # Check if any superuser
# exists already, leave as is if it does
if User.objects.filter(is_superuser=True).count() > 0:
self.stdout.write(
self.style.NOTICE(
"Did not create superuser, the DB already contains superusers",
),
)
return
if password is None:
self.stdout.write(
self.style.ERROR(
"Please check if PAPERLESS_ADMIN_PASSWORD has been"
" set in the environment",
),
)
else:
# Create superuser with password based on env variable
User.objects.create_superuser(username, mail, password)
self.stdout.write(
self.style.SUCCESS(
f'Created superuser "{username}" with provided password.',
),
)

View File

@@ -0,0 +1,175 @@
import base64
import os
from argparse import ArgumentParser
from typing import TypedDict
from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
from django.core.management import CommandError
from documents.settings import EXPORTER_CRYPTO_ALGO_NAME
from documents.settings import EXPORTER_CRYPTO_KEY_ITERATIONS_NAME
from documents.settings import EXPORTER_CRYPTO_KEY_SIZE_NAME
from documents.settings import EXPORTER_CRYPTO_SALT_NAME
from documents.settings import EXPORTER_CRYPTO_SETTINGS_NAME
class CryptFields(TypedDict):
exporter_key: str
model_name: str
fields: list[str]
class MultiProcessMixin:
"""
Small class to handle adding an argument and validating it
for the use of multiple processes
"""
def add_argument_processes_mixin(self, parser: ArgumentParser):
parser.add_argument(
"--processes",
            default=max(1, (os.cpu_count() or 1) // 4),
type=int,
help="Number of processes to distribute work amongst",
)
def handle_processes_mixin(self, *args, **options):
self.process_count = options["processes"]
if self.process_count < 1:
raise CommandError("There must be at least 1 process")
class ProgressBarMixin:
"""
Many commands use a progress bar, which can be disabled
via this class
"""
def add_argument_progress_bar_mixin(self, parser: ArgumentParser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def handle_progress_bar_mixin(self, *args, **options):
self.no_progress_bar = options["no_progress_bar"]
self.use_progress_bar = not self.no_progress_bar
class CryptMixin:
"""
Fully based on:
https://cryptography.io/en/latest/fernet/#using-passwords-with-fernet
To encrypt:
1. Call setup_crypto providing the user provided passphrase
2. Call encrypt_string with a value
3. Store the returned hexadecimal representation of the value
To decrypt:
1. Load the required parameters:
a. key iterations
b. key size
c. key algorithm
2. Call setup_crypto providing the user provided passphrase and stored salt
3. Call decrypt_string with a value
4. Use the returned value
"""
# This matches to Django's default for now
# https://github.com/django/django/blob/adae61942/django/contrib/auth/hashers.py#L315
# Set the defaults to be used during export
# During import, these are overridden from the loaded values to ensure decryption is possible
key_iterations = 1_000_000
salt_size = 16
key_size = 32
kdf_algorithm = "pbkdf2_sha256"
    CRYPT_FIELDS: list[CryptFields] = [
{
"exporter_key": "mail_accounts",
"model_name": "paperless_mail.mailaccount",
"fields": [
"password",
"refresh_token",
],
},
{
"exporter_key": "social_tokens",
"model_name": "socialaccount.socialtoken",
"fields": [
"token",
"token_secret",
],
},
]
def get_crypt_params(self) -> dict[str, dict[str, str | int]]:
return {
EXPORTER_CRYPTO_SETTINGS_NAME: {
EXPORTER_CRYPTO_ALGO_NAME: self.kdf_algorithm,
EXPORTER_CRYPTO_KEY_ITERATIONS_NAME: self.key_iterations,
EXPORTER_CRYPTO_KEY_SIZE_NAME: self.key_size,
EXPORTER_CRYPTO_SALT_NAME: self.salt,
},
}
def load_crypt_params(self, metadata: dict):
# Load up the values for setting up decryption
self.kdf_algorithm: str = metadata[EXPORTER_CRYPTO_SETTINGS_NAME][
EXPORTER_CRYPTO_ALGO_NAME
]
self.key_iterations: int = metadata[EXPORTER_CRYPTO_SETTINGS_NAME][
EXPORTER_CRYPTO_KEY_ITERATIONS_NAME
]
self.key_size: int = metadata[EXPORTER_CRYPTO_SETTINGS_NAME][
EXPORTER_CRYPTO_KEY_SIZE_NAME
]
self.salt: str = metadata[EXPORTER_CRYPTO_SETTINGS_NAME][
EXPORTER_CRYPTO_SALT_NAME
]
def setup_crypto(self, *, passphrase: str, salt: str | None = None):
"""
Constructs a class for encryption or decryption using the specified passphrase and salt
Salt is assumed to be a hexadecimal representation of a cryptographically secure random byte string.
If not provided, it will be derived from the system secure random
"""
self.salt = salt or os.urandom(self.salt_size).hex()
# Derive the KDF based on loaded settings
if self.kdf_algorithm == "pbkdf2_sha256":
kdf = PBKDF2HMAC(
algorithm=hashes.SHA256(),
length=self.key_size,
salt=bytes.fromhex(self.salt),
iterations=self.key_iterations,
)
else: # pragma: no cover
raise CommandError(
f"{self.kdf_algorithm} is an unknown key derivation function",
)
key = base64.urlsafe_b64encode(kdf.derive(passphrase.encode("utf-8")))
self.fernet = Fernet(key)
def encrypt_string(self, *, value: str) -> str:
"""
Given a string value, encrypts it and returns the hexadecimal representation of the encrypted token
"""
return self.fernet.encrypt(value.encode("utf-8")).hex()
def decrypt_string(self, *, value: str) -> str:
"""
Given a string value, decrypts it and returns the original value of the field
"""
return self.fernet.decrypt(bytes.fromhex(value)).decode("utf-8")
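
As the class docstring outlines, encryption and decryption are symmetric around setup_crypto; a rough round-trip sketch using standalone instances purely for illustration:

from documents.management.commands.mixins import CryptMixin

exporter_side = CryptMixin()
exporter_side.setup_crypto(passphrase="correct horse battery staple")
token = exporter_side.encrypt_string(value="imap-password")

# The salt and key parameters travel with the export metadata (see get_crypt_params)
importer_side = CryptMixin()
importer_side.setup_crypto(
    passphrase="correct horse battery staple",
    salt=exporter_side.salt,
)
assert importer_side.decrypt_string(value=token) == "imap-password"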

View File

@@ -0,0 +1,39 @@
from auditlog.models import LogEntry
from django.core.management.base import BaseCommand
from django.db import transaction
from tqdm import tqdm
from documents.management.commands.mixins import ProgressBarMixin
class Command(BaseCommand, ProgressBarMixin):
"""
Prune the audit logs of objects that no longer exist.
"""
help = "Prunes the audit logs of objects that no longer exist."
def add_arguments(self, parser):
self.add_argument_progress_bar_mixin(parser)
def handle(self, **options):
self.handle_progress_bar_mixin(**options)
with transaction.atomic():
for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
model_class = log_entry.content_type.model_class()
# use global_objects for SoftDeleteModel
objects = (
model_class.global_objects
if hasattr(model_class, "global_objects")
else model_class.objects
)
if (
log_entry.object_id
and not objects.filter(pk=log_entry.object_id).exists()
):
log_entry.delete()
tqdm.write(
self.style.NOTICE(
f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
),
)

508
src/documents/matching.py Normal file
View File

@@ -0,0 +1,508 @@
from __future__ import annotations
import logging
import re
from fnmatch import fnmatch
from fnmatch import translate as fnmatch_translate
from typing import TYPE_CHECKING
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
from documents.models import Tag
from documents.models import Workflow
from documents.models import WorkflowTrigger
from documents.permissions import get_objects_for_user_owner_aware
if TYPE_CHECKING:
from django.db.models import QuerySet
from documents.classifier import DocumentClassifier
logger = logging.getLogger("paperless.matching")
def log_reason(
matching_model: MatchingModel | WorkflowTrigger,
document: Document,
reason: str,
):
class_name = type(matching_model).__name__
name = (
matching_model.name if hasattr(matching_model, "name") else str(matching_model)
)
logger.debug(
f"{class_name} {name} matched on document {document} because {reason}",
)
def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
pred_id = (
classifier.predict_correspondent(document.suggestion_content)
if classifier
else None
)
if user is None and document.owner is not None:
user = document.owner
if user is not None:
correspondents = get_objects_for_user_owner_aware(
user,
"documents.view_correspondent",
Correspondent,
)
else:
correspondents = Correspondent.objects.all()
return list(
filter(
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
correspondents,
),
)
def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
pred_id = (
classifier.predict_document_type(document.suggestion_content)
if classifier
else None
)
if user is None and document.owner is not None:
user = document.owner
if user is not None:
document_types = get_objects_for_user_owner_aware(
user,
"documents.view_documenttype",
DocumentType,
)
else:
document_types = DocumentType.objects.all()
return list(
filter(
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
document_types,
),
)
def match_tags(document: Document, classifier: DocumentClassifier, user=None):
predicted_tag_ids = (
classifier.predict_tags(document.suggestion_content) if classifier else []
)
if user is None and document.owner is not None:
user = document.owner
if user is not None:
tags = get_objects_for_user_owner_aware(user, "documents.view_tag", Tag)
else:
tags = Tag.objects.all()
return list(
filter(
lambda o: matches(o, document)
or (
o.matching_algorithm == MatchingModel.MATCH_AUTO
and o.pk in predicted_tag_ids
),
tags,
),
)
def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
pred_id = (
classifier.predict_storage_path(document.suggestion_content)
if classifier
else None
)
if user is None and document.owner is not None:
user = document.owner
if user is not None:
storage_paths = get_objects_for_user_owner_aware(
user,
"documents.view_storagepath",
StoragePath,
)
else:
storage_paths = StoragePath.objects.all()
return list(
filter(
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
storage_paths,
),
)
def matches(matching_model: MatchingModel, document: Document):
search_kwargs = {}
document_content = document.content
# Check that match is not empty
if not matching_model.match.strip():
return False
if matching_model.is_insensitive:
search_kwargs = {"flags": re.IGNORECASE}
if matching_model.matching_algorithm == MatchingModel.MATCH_NONE:
return False
elif matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
for word in _split_match(matching_model):
search_result = re.search(rf"\b{word}\b", document_content, **search_kwargs)
if not search_result:
return False
log_reason(
matching_model,
document,
f"it contains all of these words: {matching_model.match}",
)
return True
elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
for word in _split_match(matching_model):
if re.search(rf"\b{word}\b", document_content, **search_kwargs):
log_reason(matching_model, document, f"it contains this word: {word}")
return True
return False
elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
result = bool(
re.search(
rf"\b{re.escape(matching_model.match)}\b",
document_content,
**search_kwargs,
),
)
if result:
log_reason(
matching_model,
document,
f'it contains this string: "{matching_model.match}"',
)
return result
elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
try:
match = re.search(
re.compile(matching_model.match, **search_kwargs),
document_content,
)
except re.error:
logger.error(
f"Error while processing regular expression {matching_model.match}",
)
return False
if match:
log_reason(
matching_model,
document,
f"the string {match.group()} matches the regular expression "
f"{matching_model.match}",
)
return bool(match)
elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
from rapidfuzz import fuzz
match = re.sub(r"[^\w\s]", "", matching_model.match)
text = re.sub(r"[^\w\s]", "", document_content)
if matching_model.is_insensitive:
match = match.lower()
text = text.lower()
if fuzz.partial_ratio(match, text, score_cutoff=90):
# TODO: make this better
log_reason(
matching_model,
document,
f"parts of the document content somehow match the string "
f"{matching_model.match}",
)
return True
else:
return False
elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
# this is done elsewhere.
return False
else:
raise NotImplementedError("Unsupported matching algorithm")
def _split_match(matching_model):
"""
Splits the match to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
Example:
' some random words "with quotes " and spaces'
==>
["some", "random", "words", "with+quotes", "and", "spaces"]
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
# normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
for t in findterms(matching_model.match)
]
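
The MATCH_ALL and MATCH_ANY branches above reduce to whole-word regex searches over the document content; a minimal sketch of that core check, detached from the model classes:

import re

def word_hit(word: str, content: str, *, insensitive: bool = True) -> bool:
    flags = re.IGNORECASE if insensitive else 0
    return re.search(rf"\b{word}\b", content, flags) is not None

content = "Payment reminder for invoice 4711"
words = [re.escape(w) for w in ("invoice", "reminder")]

match_all = all(word_hit(w, content) for w in words)  # MATCH_ALL semantics
match_any = any(word_hit(w, content) for w in words)  # MATCH_ANY semantics
print(match_all, match_any)  # True True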
def consumable_document_matches_workflow(
document: ConsumableDocument,
trigger: WorkflowTrigger,
) -> tuple[bool, str]:
"""
Returns True if the ConsumableDocument matches all filters from the workflow trigger,
False otherwise. Includes a reason if doesn't match
"""
trigger_matched = True
reason = ""
# Document source vs trigger source
if len(trigger.sources) > 0 and document.source not in [
int(x) for x in list(trigger.sources)
]:
reason = (
f"Document source {document.source.name} not in"
f" {[DocumentSource(int(x)).name for x in trigger.sources]}",
)
trigger_matched = False
# Document mail rule vs trigger mail rule
if (
trigger.filter_mailrule is not None
and document.mailrule_id != trigger.filter_mailrule.pk
):
reason = (
f"Document mail rule {document.mailrule_id}"
f" != {trigger.filter_mailrule.pk}",
)
trigger_matched = False
# Document filename vs trigger filename
if (
trigger.filter_filename is not None
and len(trigger.filter_filename) > 0
and not fnmatch(
document.original_file.name.lower(),
trigger.filter_filename.lower(),
)
):
        reason = (
            f"Document filename {document.original_file.name} does not match"
            f" {trigger.filter_filename.lower()}"
        )
trigger_matched = False
# Document path vs trigger path
    # Use the original_path if set, else use the original_file
match_against = (
document.original_path
if document.original_path is not None
else document.original_file
)
if (
trigger.filter_path is not None
and len(trigger.filter_path) > 0
and not fnmatch(
match_against,
trigger.filter_path,
)
):
        reason = (
            f"Document path {document.original_file}"
            f" does not match {trigger.filter_path}"
        )
trigger_matched = False
return (trigger_matched, reason)
def existing_document_matches_workflow(
document: Document,
trigger: WorkflowTrigger,
) -> tuple[bool, str]:
"""
Returns True if the Document matches all filters from the workflow trigger,
    False otherwise. Includes a reason if it doesn't match.
"""
trigger_matched = True
reason = ""
if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches(
trigger,
document,
):
        reason = (
            f"Document content matching settings for algorithm '{trigger.matching_algorithm}' did not match"
        )
trigger_matched = False
# Document tags vs trigger has_tags
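    # having any one of the trigger's tags is enough; the document does not need all of them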
if (
trigger.filter_has_tags.all().count() > 0
and document.tags.filter(
id__in=trigger.filter_has_tags.all().values_list("id"),
).count()
== 0
):
        reason = (
            f"Document tags {document.tags.all()} do not include"
            f" {trigger.filter_has_tags.all()}"
        )
trigger_matched = False
# Document correspondent vs trigger has_correspondent
if (
trigger.filter_has_correspondent is not None
and document.correspondent != trigger.filter_has_correspondent
):
        reason = (
            f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}"
        )
trigger_matched = False
# Document document_type vs trigger has_document_type
if (
trigger.filter_has_document_type is not None
and document.document_type != trigger.filter_has_document_type
):
        reason = (
            f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}"
        )
trigger_matched = False
# Document storage_path vs trigger has_storage_path
if (
trigger.filter_has_storage_path is not None
and document.storage_path != trigger.filter_has_storage_path
):
        reason = (
            f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}"
        )
trigger_matched = False
# Document original_filename vs trigger filename
if (
trigger.filter_filename is not None
and len(trigger.filter_filename) > 0
and document.original_filename is not None
and not fnmatch(
document.original_filename.lower(),
trigger.filter_filename.lower(),
)
):
        reason = (
            f"Document filename {document.original_filename} does not match"
            f" {trigger.filter_filename.lower()}"
        )
trigger_matched = False
return (trigger_matched, reason)
def prefilter_documents_by_workflowtrigger(
documents: QuerySet[Document],
trigger: WorkflowTrigger,
) -> QuerySet[Document]:
"""
    To prevent scheduled workflows from checking every document, we prefilter the
    documents by the workflow trigger filters. This runs before e.g.
    document_matches_workflow in run_workflows.
"""
if trigger.filter_has_tags.all().count() > 0:
documents = documents.filter(
tags__in=trigger.filter_has_tags.all(),
).distinct()
if trigger.filter_has_correspondent is not None:
documents = documents.filter(
correspondent=trigger.filter_has_correspondent,
)
if trigger.filter_has_document_type is not None:
documents = documents.filter(
document_type=trigger.filter_has_document_type,
)
if trigger.filter_has_storage_path is not None:
documents = documents.filter(
storage_path=trigger.filter_has_storage_path,
)
if trigger.filter_filename is not None and len(trigger.filter_filename) > 0:
# the true fnmatch will actually run later so we just want a loose filter here
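        # e.g. a filter of "*.pdf" becomes a case-insensitive regex that matches
        # any original_filename ending in ".pdf"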
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
regex = f"(?i){regex}"
documents = documents.filter(original_filename__regex=regex)
return documents
def document_matches_workflow(
document: ConsumableDocument | Document,
workflow: Workflow,
trigger_type: WorkflowTrigger.WorkflowTriggerType,
) -> bool:
"""
    Returns True if the ConsumableDocument or Document matches all filters and
    settings from any of the workflow's triggers of the given type, False otherwise.
"""
trigger_matched = True
if workflow.triggers.filter(type=trigger_type).count() == 0:
trigger_matched = False
logger.info(f"Document did not match {workflow}")
logger.debug(f"No matching triggers with type {trigger_type} found")
else:
for trigger in workflow.triggers.filter(type=trigger_type):
if trigger_type == WorkflowTrigger.WorkflowTriggerType.CONSUMPTION:
trigger_matched, reason = consumable_document_matches_workflow(
document,
trigger,
)
elif (
trigger_type == WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED
or trigger_type == WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED
or trigger_type == WorkflowTrigger.WorkflowTriggerType.SCHEDULED
):
trigger_matched, reason = existing_document_matches_workflow(
document,
trigger,
)
else:
# New trigger types need to be explicitly checked above
raise Exception(f"Trigger type {trigger_type} not yet supported")
if trigger_matched:
logger.info(f"Document matched {trigger} from {workflow}")
# matched, bail early
return True
else:
logger.info(f"Document did not match {workflow}")
logger.debug(reason)
return trigger_matched

View File

@@ -0,0 +1,40 @@
# Generated by Django 1.9 on 2015-12-20 19:10
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
initial = True
dependencies = []
operations = [
migrations.CreateModel(
name="Document",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("sender", models.CharField(blank=True, db_index=True, max_length=128)),
("title", models.CharField(blank=True, db_index=True, max_length=128)),
(
"content",
models.TextField(
db_index=(
"mysql" not in settings.DATABASES["default"]["ENGINE"]
),
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
],
),
]

View File

@@ -0,0 +1,26 @@
# Generated by Django 1.9 on 2015-12-26 13:16
import django.utils.timezone
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0001_initial"),
]
operations = [
migrations.AlterModelOptions(
name="document",
options={"ordering": ("sender", "title")},
),
migrations.AlterField(
model_name="document",
name="created",
field=models.DateTimeField(
default=django.utils.timezone.now,
editable=False,
),
),
]

View File

@@ -0,0 +1,70 @@
# Generated by Django 1.9 on 2016-01-11 12:21
import django.db.models.deletion
from django.db import migrations
from django.db import models
from django.template.defaultfilters import slugify
DOCUMENT_SENDER_MAP = {}
def move_sender_strings_to_sender_model(apps, schema_editor):
sender_model = apps.get_model("documents", "Sender")
document_model = apps.get_model("documents", "Document")
# Create the sender and log the relationship with the document
for document in document_model.objects.all():
if document.sender:
(
DOCUMENT_SENDER_MAP[document.pk],
_,
) = sender_model.objects.get_or_create(
name=document.sender,
defaults={"slug": slugify(document.sender)},
)
def realign_senders(apps, schema_editor):
document_model = apps.get_model("documents", "Document")
for pk, sender in DOCUMENT_SENDER_MAP.items():
document_model.objects.filter(pk=pk).update(sender=sender)
class Migration(migrations.Migration):
dependencies = [
("documents", "0002_auto_20151226_1316"),
]
operations = [
migrations.CreateModel(
name="Sender",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, unique=True)),
("slug", models.SlugField()),
],
),
migrations.RunPython(move_sender_strings_to_sender_model),
migrations.RemoveField(
model_name="document",
name="sender",
),
migrations.AddField(
model_name="document",
name="sender",
field=models.ForeignKey(
blank=True,
on_delete=django.db.models.deletion.CASCADE,
to="documents.Sender",
),
),
migrations.RunPython(realign_senders),
]

View File

@@ -0,0 +1,25 @@
# Generated by Django 1.9 on 2016-01-14 18:44
import django.db.models.deletion
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0003_sender"),
]
operations = [
migrations.AlterField(
model_name="document",
name="sender",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="documents",
to="documents.Sender",
),
),
]

View File

@@ -0,0 +1,178 @@
# Generated by Django 4.2.13 on 2024-06-28 17:52
import django.db.models.deletion
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("documents", "0004_auto_20160114_1844"),
("documents", "0005_auto_20160123_0313"),
("documents", "0006_auto_20160123_0430"),
("documents", "0007_auto_20160126_2114"),
("documents", "0008_document_file_type"),
("documents", "0009_auto_20160214_0040"),
("documents", "0010_log"),
("documents", "0011_auto_20160303_1929"),
]
dependencies = [
("documents", "0003_sender"),
]
operations = [
migrations.AlterField(
model_name="document",
name="sender",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="documents",
to="documents.sender",
),
),
migrations.AlterModelOptions(
name="sender",
options={"ordering": ("name",)},
),
migrations.CreateModel(
name="Tag",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, unique=True)),
("slug", models.SlugField(blank=True)),
(
"colour",
models.PositiveIntegerField(
choices=[
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc"),
],
default=1,
),
),
("match", models.CharField(blank=True, max_length=256)),
(
"matching_algorithm",
models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
),
),
],
options={
"abstract": False,
},
),
migrations.AlterField(
model_name="sender",
name="slug",
field=models.SlugField(blank=True),
),
migrations.AddField(
model_name="document",
name="file_type",
field=models.CharField(
choices=[
("pdf", "PDF"),
("png", "PNG"),
("jpg", "JPG"),
("gif", "GIF"),
("tiff", "TIFF"),
],
default="pdf",
editable=False,
max_length=4,
),
preserve_default=False,
),
migrations.AddField(
model_name="document",
name="tags",
field=models.ManyToManyField(
blank=True,
related_name="documents",
to="documents.tag",
),
),
migrations.CreateModel(
name="Log",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("group", models.UUIDField(blank=True)),
("message", models.TextField()),
(
"level",
models.PositiveIntegerField(
choices=[
(10, "Debugging"),
(20, "Informational"),
(30, "Warning"),
(40, "Error"),
(50, "Critical"),
],
default=20,
),
),
(
"component",
models.PositiveIntegerField(
choices=[(1, "Consumer"), (2, "Mail Fetcher")],
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
],
options={
"ordering": ("-modified",),
},
),
migrations.RenameModel(
old_name="Sender",
new_name="Correspondent",
),
migrations.AlterModelOptions(
name="document",
options={"ordering": ("correspondent", "title")},
),
migrations.RenameField(
model_name="document",
old_name="sender",
new_name="correspondent",
),
]

View File

@@ -0,0 +1,16 @@
# Generated by Django 1.9 on 2016-01-23 03:13
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("documents", "0004_auto_20160114_1844"),
]
operations = [
migrations.AlterModelOptions(
name="sender",
options={"ordering": ("name",)},
),
]

View File

@@ -0,0 +1,64 @@
# Generated by Django 1.9 on 2016-01-23 04:30
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0005_auto_20160123_0313"),
]
operations = [
migrations.CreateModel(
name="Tag",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, unique=True)),
("slug", models.SlugField(blank=True)),
(
"colour",
models.PositiveIntegerField(
choices=[
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#ffff99"),
(12, "#b15928"),
(13, "#000000"),
(14, "#cccccc"),
],
default=1,
),
),
],
options={
"abstract": False,
},
),
migrations.AlterField(
model_name="sender",
name="slug",
field=models.SlugField(blank=True),
),
migrations.AddField(
model_name="document",
name="tags",
field=models.ManyToManyField(related_name="documents", to="documents.Tag"),
),
]

View File

@@ -0,0 +1,55 @@
# Generated by Django 1.9 on 2016-01-26 21:14
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0006_auto_20160123_0430"),
]
operations = [
migrations.AddField(
model_name="tag",
name="match",
field=models.CharField(blank=True, max_length=256),
),
migrations.AddField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
blank=True,
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
],
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
null=True,
),
),
migrations.AlterField(
model_name="tag",
name="colour",
field=models.PositiveIntegerField(
choices=[
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc"),
],
default=1,
),
),
]

View File

@@ -0,0 +1,39 @@
# Generated by Django 1.9 on 2016-01-29 22:58
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0007_auto_20160126_2114"),
]
operations = [
migrations.AddField(
model_name="document",
name="file_type",
field=models.CharField(
choices=[
("pdf", "PDF"),
("png", "PNG"),
("jpg", "JPG"),
("gif", "GIF"),
("tiff", "TIFF"),
],
default="pdf",
editable=False,
max_length=4,
),
preserve_default=False,
),
migrations.AlterField(
model_name="document",
name="tags",
field=models.ManyToManyField(
blank=True,
related_name="documents",
to="documents.Tag",
),
),
]

View File

@@ -0,0 +1,27 @@
# Generated by Django 1.9 on 2016-02-14 00:40
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0008_document_file_type"),
]
operations = [
migrations.AlterField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
),
),
]

View File

@@ -0,0 +1,53 @@
# Generated by Django 1.9 on 2016-02-27 17:54
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0009_auto_20160214_0040"),
]
operations = [
migrations.CreateModel(
name="Log",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("group", models.UUIDField(blank=True)),
("message", models.TextField()),
(
"level",
models.PositiveIntegerField(
choices=[
(10, "Debugging"),
(20, "Informational"),
(30, "Warning"),
(40, "Error"),
(50, "Critical"),
],
default=20,
),
),
(
"component",
models.PositiveIntegerField(
choices=[(1, "Consumer"), (2, "Mail Fetcher")],
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
],
options={
"ordering": ("-modified",),
},
),
]

View File

@@ -0,0 +1,26 @@
# Generated by Django 1.9.2 on 2016-03-03 19:29
from django.db import migrations
class Migration(migrations.Migration):
atomic = False
dependencies = [
("documents", "0010_log"),
]
operations = [
migrations.RenameModel(
old_name="Sender",
new_name="Correspondent",
),
migrations.AlterModelOptions(
name="document",
options={"ordering": ("correspondent", "title")},
),
migrations.RenameField(
model_name="document",
old_name="sender",
new_name="correspondent",
),
]

View File

@@ -0,0 +1,128 @@
# Generated by Django 1.9.2 on 2016-03-05 00:40
import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
import gnupg
from django.conf import settings
from django.db import migrations
from django.utils.termcolors import colorize as colourise # Spelling hurts me
class GnuPG:
"""
A handy singleton to use when handling encrypted files.
"""
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
@classmethod
def decrypted(cls, file_handle):
return cls.gpg.decrypt_file(file_handle, passphrase=settings.PASSPHRASE).data
@classmethod
def encrypted(cls, file_handle):
return cls.gpg.encrypt_file(
file_handle,
recipients=None,
passphrase=settings.PASSPHRASE,
symmetric=True,
).data
def move_documents_and_create_thumbnails(apps, schema_editor):
(Path(settings.MEDIA_ROOT) / "documents" / "originals").mkdir(
parents=True,
exist_ok=True,
)
(Path(settings.MEDIA_ROOT) / "documents" / "thumbnails").mkdir(
parents=True,
exist_ok=True,
)
documents: list[str] = os.listdir(Path(settings.MEDIA_ROOT) / "documents") # noqa: PTH208
if set(documents) == {"originals", "thumbnails"}:
return
print(
colourise(
"\n\n"
" This is a one-time only migration to generate thumbnails for all of your\n"
" documents so that future UIs will have something to work with. If you have\n"
" a lot of documents though, this may take a while, so a coffee break may be\n"
" in order."
"\n",
opts=("bold",),
),
)
Path(settings.SCRATCH_DIR).mkdir(parents=True, exist_ok=True)
for f in sorted(documents):
if not f.endswith("gpg"):
continue
print(
" {} {} {}".format(
colourise("*", fg="green"),
colourise("Generating a thumbnail for", fg="white"),
colourise(f, fg="cyan"),
),
)
thumb_temp: str = tempfile.mkdtemp(prefix="paperless", dir=settings.SCRATCH_DIR)
orig_temp: str = tempfile.mkdtemp(prefix="paperless", dir=settings.SCRATCH_DIR)
orig_source: Path = Path(settings.MEDIA_ROOT) / "documents" / f
orig_target: Path = Path(orig_temp) / f.replace(".gpg", "")
with orig_source.open("rb") as encrypted, orig_target.open("wb") as unencrypted:
unencrypted.write(GnuPG.decrypted(encrypted))
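        # ImageMagick's convert renders every page to convert-NNNN.png; only the
        # first page (convert-0000.png) is kept as the thumbnail below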
subprocess.Popen(
(
settings.CONVERT_BINARY,
"-scale",
"500x5000",
"-alpha",
"remove",
orig_target,
Path(thumb_temp) / "convert-%04d.png",
),
).wait()
thumb_source: Path = Path(thumb_temp) / "convert-0000.png"
thumb_target: Path = (
Path(settings.MEDIA_ROOT)
/ "documents"
/ "thumbnails"
/ re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
)
with (
thumb_source.open("rb") as unencrypted,
thumb_target.open("wb") as encrypted,
):
encrypted.write(GnuPG.encrypted(unencrypted))
shutil.rmtree(thumb_temp)
shutil.rmtree(orig_temp)
shutil.move(
Path(settings.MEDIA_ROOT) / "documents" / f,
Path(settings.MEDIA_ROOT) / "documents" / "originals" / f,
)
class Migration(migrations.Migration):
dependencies = [
("documents", "0011_auto_20160303_1929"),
]
operations = [
migrations.RunPython(move_documents_and_create_thumbnails),
]

View File

@@ -0,0 +1,42 @@
# Generated by Django 1.9.4 on 2016-03-25 21:11
import django.utils.timezone
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0012_auto_20160305_0040"),
]
operations = [
migrations.AddField(
model_name="correspondent",
name="match",
field=models.CharField(blank=True, max_length=256),
),
migrations.AddField(
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
),
),
migrations.AlterField(
model_name="document",
name="created",
field=models.DateTimeField(default=django.utils.timezone.now),
),
migrations.RemoveField(
model_name="log",
name="component",
),
]

View File

@@ -0,0 +1,182 @@
# Generated by Django 1.9.4 on 2016-03-28 19:09
import hashlib
from pathlib import Path
import django.utils.timezone
import gnupg
from django.conf import settings
from django.db import migrations
from django.db import models
from django.template.defaultfilters import slugify
from django.utils.termcolors import colorize as colourise # Spelling hurts me
class GnuPG:
"""
A handy singleton to use when handling encrypted files.
"""
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
@classmethod
def decrypted(cls, file_handle):
return cls.gpg.decrypt_file(file_handle, passphrase=settings.PASSPHRASE).data
@classmethod
def encrypted(cls, file_handle):
return cls.gpg.encrypt_file(
file_handle,
recipients=None,
passphrase=settings.PASSPHRASE,
symmetric=True,
).data
class Document:
"""
Django's migrations restrict access to model methods, so this is a snapshot
of the methods that existed at the time this migration was written, since
we need to make use of a lot of these shortcuts here.
"""
def __init__(self, doc):
self.pk = doc.pk
self.correspondent = doc.correspondent
self.title = doc.title
self.file_type = doc.file_type
self.tags = doc.tags
self.created = doc.created
def __str__(self):
created = self.created.strftime("%Y%m%d%H%M%S")
if self.correspondent and self.title:
return f"{created}: {self.correspondent} - {self.title}"
if self.correspondent or self.title:
return f"{created}: {self.correspondent or self.title}"
return str(created)
@property
def source_path(self):
return (
Path(settings.MEDIA_ROOT)
/ "documents"
/ "originals"
/ f"{self.pk:07}.{self.file_type}.gpg"
)
@property
def source_file(self):
return self.source_path.open("rb")
@property
def file_name(self):
return slugify(str(self)) + "." + self.file_type
def set_checksums(apps, schema_editor):
document_model = apps.get_model("documents", "Document")
if not document_model.objects.all().exists():
return
print(
colourise(
"\n\n"
" This is a one-time only migration to generate checksums for all\n"
" of your existing documents. If you have a lot of documents\n"
" though, this may take a while, so a coffee break may be in\n"
" order."
"\n",
opts=("bold",),
),
)
sums = {}
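    # checksum -> (pk, file_name); used to detect and report duplicate documents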
for d in document_model.objects.all():
document = Document(d)
print(
" {} {} {}".format(
colourise("*", fg="green"),
colourise("Generating a checksum for", fg="white"),
colourise(document.file_name, fg="cyan"),
),
)
with document.source_file as encrypted:
checksum = hashlib.md5(GnuPG.decrypted(encrypted)).hexdigest()
if checksum in sums:
error = "\n{line}{p1}\n\n{doc1}\n{doc2}\n\n{p2}\n\n{code}\n\n{p3}{line}".format(
p1=colourise(
"It appears that you have two identical documents in your collection and \nPaperless no longer supports this (see issue #97). The documents in question\nare:",
fg="yellow",
),
p2=colourise(
"To fix this problem, you'll have to remove one of them from the database, a task\nmost easily done by running the following command in the same\ndirectory as manage.py:",
fg="yellow",
),
p3=colourise(
"When that's finished, re-run the migrate, and provided that there aren't any\nother duplicates, you should be good to go.",
fg="yellow",
),
doc1=colourise(
f" * {sums[checksum][1]} (id: {sums[checksum][0]})",
fg="red",
),
doc2=colourise(
f" * {document.file_name} (id: {document.pk})",
fg="red",
),
code=colourise(
f" $ echo 'DELETE FROM documents_document WHERE id = {document.pk};' | ./manage.py dbshell",
fg="green",
),
line=colourise("\n{}\n".format("=" * 80), fg="white", opts=("bold",)),
)
raise RuntimeError(error)
sums[checksum] = (document.pk, document.file_name)
document_model.objects.filter(pk=document.pk).update(checksum=checksum)
def do_nothing(apps, schema_editor):
pass
class Migration(migrations.Migration):
dependencies = [
("documents", "0013_auto_20160325_2111"),
]
operations = [
migrations.AddField(
model_name="document",
name="checksum",
field=models.CharField(
default="-",
db_index=True,
editable=False,
max_length=32,
help_text="The checksum of the original document (before it "
"was encrypted). We use this to prevent duplicate "
"document imports.",
),
preserve_default=False,
),
migrations.RunPython(set_checksums, do_nothing),
migrations.AlterField(
model_name="document",
name="created",
field=models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
),
),
migrations.AlterField(
model_name="document",
name="modified",
field=models.DateTimeField(auto_now=True, db_index=True),
),
]

View File

@@ -0,0 +1,33 @@
# Generated by Django 1.10.2 on 2016-10-05 21:38
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0014_document_checksum"),
]
operations = [
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.",
max_length=32,
unique=True,
),
),
migrations.AddField(
model_name="correspondent",
name="is_insensitive",
field=models.BooleanField(default=True),
),
migrations.AddField(
model_name="tag",
name="is_insensitive",
field=models.BooleanField(default=True),
),
]

View File

@@ -0,0 +1,92 @@
# Generated by Django 4.2.13 on 2024-06-28 17:57
import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("documents", "0015_add_insensitive_to_match"),
("documents", "0016_auto_20170325_1558"),
("documents", "0017_auto_20170512_0507"),
("documents", "0018_auto_20170715_1712"),
]
dependencies = [
("documents", "0014_document_checksum"),
]
operations = [
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.",
max_length=32,
unique=True,
),
),
migrations.AddField(
model_name="correspondent",
name="is_insensitive",
field=models.BooleanField(default=True),
),
migrations.AddField(
model_name="tag",
name="is_insensitive",
field=models.BooleanField(default=True),
),
migrations.AlterField(
model_name="document",
name="content",
field=models.TextField(
blank=True,
db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]),
help_text="The raw, text-only data of the document. This field is primarily used for searching.",
),
),
migrations.AlterField(
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
),
),
migrations.AlterField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
),
),
migrations.AlterField(
model_name="document",
name="correspondent",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.correspondent",
),
),
]

View File

@@ -0,0 +1,23 @@
# Generated by Django 1.10.5 on 2017-03-25 15:58
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0015_add_insensitive_to_match"),
]
operations = [
migrations.AlterField(
model_name="document",
name="content",
field=models.TextField(
blank=True,
db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]),
help_text="The raw, text-only data of the document. This field is primarily used for searching.",
),
),
]

View File

@@ -0,0 +1,43 @@
# Generated by Django 1.10.5 on 2017-05-12 05:07
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0016_auto_20170325_1558"),
]
operations = [
migrations.AlterField(
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
),
),
migrations.AlterField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
),
),
]

View File

@@ -0,0 +1,25 @@
# Generated by Django 1.10.5 on 2017-07-15 17:12
import django.db.models.deletion
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0017_auto_20170512_0507"),
]
operations = [
migrations.AlterField(
model_name="document",
name="correspondent",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.Correspondent",
),
),
]

View File

@@ -0,0 +1,22 @@
# Generated by Django 1.10.5 on 2017-07-15 17:12
from django.contrib.auth.models import User
from django.db import migrations
def forwards_func(apps, schema_editor):
User.objects.create(username="consumer")
def reverse_func(apps, schema_editor):
User.objects.get(username="consumer").delete()
class Migration(migrations.Migration):
dependencies = [
("documents", "0018_auto_20170715_1712"),
]
operations = [
migrations.RunPython(forwards_func, reverse_func),
]

View File

@@ -0,0 +1,29 @@
import django.utils.timezone
from django.db import migrations
from django.db import models
def set_added_time_to_created_time(apps, schema_editor):
Document = apps.get_model("documents", "Document")
for doc in Document.objects.all():
doc.added = doc.created
doc.save()
class Migration(migrations.Migration):
dependencies = [
("documents", "0019_add_consumer_user"),
]
operations = [
migrations.AddField(
model_name="document",
name="added",
field=models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
editable=False,
),
),
migrations.RunPython(set_added_time_to_created_time),
]

View File

@@ -0,0 +1,41 @@
# Generated by Django 1.11.10 on 2018-02-04 13:07
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0020_document_added"),
]
operations = [
# Add the field with the default GPG-encrypted value
migrations.AddField(
model_name="document",
name="storage_type",
field=models.CharField(
choices=[
("unencrypted", "Unencrypted"),
("gpg", "Encrypted with GNU Privacy Guard"),
],
default="gpg",
editable=False,
max_length=11,
),
),
# Now that the field is added, change the default to unencrypted
migrations.AlterField(
model_name="document",
name="storage_type",
field=models.CharField(
choices=[
("unencrypted", "Unencrypted"),
("gpg", "Encrypted with GNU Privacy Guard"),
],
default="unencrypted",
editable=False,
max_length=11,
),
),
]

View File

@@ -0,0 +1,61 @@
# Generated by Django 2.0.8 on 2018-10-07 14:20
from django.db import migrations
from django.db import models
from django.utils.text import slugify
def re_slug_all_the_things(apps, schema_editor):
"""
Rewrite all slug values to make sure they're actually slugs before we brand
them as uneditable.
"""
Tag = apps.get_model("documents", "Tag")
Correspondent = apps.get_model("documents", "Correspondent")
for klass in (Tag, Correspondent):
for instance in klass.objects.all():
klass.objects.filter(pk=instance.pk).update(slug=slugify(instance.slug))
class Migration(migrations.Migration):
dependencies = [
("documents", "0021_document_storage_type"),
]
operations = [
migrations.AlterModelOptions(
name="tag",
options={"ordering": ("name",)},
),
migrations.AlterField(
model_name="correspondent",
name="slug",
field=models.SlugField(blank=True, editable=False),
),
migrations.AlterField(
model_name="document",
name="file_type",
field=models.CharField(
choices=[
("pdf", "PDF"),
("png", "PNG"),
("jpg", "JPG"),
("gif", "GIF"),
("tiff", "TIFF"),
("txt", "TXT"),
("csv", "CSV"),
("md", "MD"),
],
editable=False,
max_length=4,
),
),
migrations.AlterField(
model_name="tag",
name="slug",
field=models.SlugField(blank=True, editable=False),
),
migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop),
]

View File

@@ -0,0 +1,39 @@
# Generated by Django 2.0.10 on 2019-04-26 18:57
from django.db import migrations
from django.db import models
def set_filename(apps, schema_editor):
Document = apps.get_model("documents", "Document")
for doc in Document.objects.all():
file_name = f"{doc.pk:07}.{doc.file_type}"
if doc.storage_type == "gpg":
file_name += ".gpg"
# Set filename
doc.filename = file_name
# Save document
doc.save()
class Migration(migrations.Migration):
dependencies = [
("documents", "0022_auto_20181007_1420"),
]
operations = [
migrations.AddField(
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
null=True,
editable=False,
help_text="Current filename in storage",
max_length=256,
),
),
migrations.RunPython(set_filename),
]

View File

@@ -0,0 +1,147 @@
# Generated by Django 3.1.3 on 2020-11-07 12:35
import uuid
import django.db.models.deletion
from django.db import migrations
from django.db import models
def logs_set_default_group(apps, schema_editor):
Log = apps.get_model("documents", "Log")
for log in Log.objects.all():
if log.group is None:
log.group = uuid.uuid4()
log.save()
class Migration(migrations.Migration):
dependencies = [
("documents", "0023_document_current_filename"),
]
operations = [
migrations.AddField(
model_name="document",
name="archive_serial_number",
field=models.IntegerField(
blank=True,
db_index=True,
help_text="The position of this document in your physical document archive.",
null=True,
unique=True,
),
),
migrations.AddField(
model_name="tag",
name="is_inbox_tag",
field=models.BooleanField(
default=False,
help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.",
),
),
migrations.CreateModel(
name="DocumentType",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, unique=True)),
("slug", models.SlugField(blank=True, editable=False)),
("match", models.CharField(blank=True, max_length=256)),
(
"matching_algorithm",
models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
(6, "Automatic Classification"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
),
),
("is_insensitive", models.BooleanField(default=True)),
],
options={
"abstract": False,
"ordering": ("name",),
},
),
migrations.AddField(
model_name="document",
name="document_type",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.documenttype",
),
),
migrations.AlterField(
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
(6, "Automatic Classification"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
),
),
migrations.AlterField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
(6, "Automatic Classification"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
),
),
migrations.AlterField(
model_name="document",
name="content",
field=models.TextField(
blank=True,
help_text="The raw, text-only data of the document. This field is primarily used for searching.",
),
),
migrations.AlterModelOptions(
name="log",
options={"ordering": ("-created",)},
),
migrations.RemoveField(
model_name="log",
name="modified",
),
migrations.AlterField(
model_name="log",
name="group",
field=models.UUIDField(blank=True, null=True),
),
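        # forwards is a no-op; on reverse the group column becomes NOT NULL
        # again, so any null groups are given a value first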
migrations.RunPython(
code=django.db.migrations.operations.special.RunPython.noop,
reverse_code=logs_set_default_group,
),
]

View File

@@ -0,0 +1,13 @@
# Generated by Django 3.1.3 on 2020-11-09 16:36
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("documents", "1000_update_paperless_all"),
]
operations = [
migrations.RunPython(migrations.RunPython.noop, migrations.RunPython.noop),
]

View File

@@ -0,0 +1,24 @@
# Generated by Django 3.1.3 on 2020-11-11 11:05
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1001_auto_20201109_1636"),
]
operations = [
migrations.AlterField(
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current filename in storage",
max_length=1024,
null=True,
),
),
]

View File

@@ -0,0 +1,92 @@
# Generated by Django 3.1.3 on 2020-11-20 11:21
from pathlib import Path
import magic
from django.conf import settings
from django.db import migrations
from django.db import models
from paperless.db import GnuPG
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
def source_path(self) -> Path:
if self.filename:
fname: str = str(self.filename)
else:
fname = f"{self.pk:07}.{self.file_type}"
if self.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg"
return Path(settings.ORIGINALS_DIR) / fname
def add_mime_types(apps, schema_editor):
Document = apps.get_model("documents", "Document")
documents = Document.objects.all()
for d in documents:
with Path(source_path(d)).open("rb") as f:
if d.storage_type == STORAGE_TYPE_GPG:
data = GnuPG.decrypted(f)
else:
data = f.read(1024)
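                # a 1 KiB prefix is usually enough for libmagic to sniff the type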
d.mime_type = magic.from_buffer(data, mime=True)
d.save()
def add_file_extensions(apps, schema_editor):
Document = apps.get_model("documents", "Document")
documents = Document.objects.all()
for d in documents:
d.file_type = Path(d.filename).suffix.lstrip(".")
d.save()
class Migration(migrations.Migration):
dependencies = [
("documents", "1002_auto_20201111_1105"),
]
operations = [
migrations.AddField(
model_name="document",
name="mime_type",
field=models.CharField(default="-", editable=False, max_length=256),
preserve_default=False,
),
migrations.RunPython(add_mime_types, migrations.RunPython.noop),
# This operation is here so that we can revert the entire migration:
# By allowing this field to be blank and null, we can revert the
# remove operation further down and the database won't complain about
# NOT NULL violations.
migrations.AlterField(
model_name="document",
name="file_type",
field=models.CharField(
choices=[
("pdf", "PDF"),
("png", "PNG"),
("jpg", "JPG"),
("gif", "GIF"),
("tiff", "TIFF"),
("txt", "TXT"),
("csv", "CSV"),
("md", "MD"),
],
editable=False,
max_length=4,
null=True,
blank=True,
),
),
migrations.RunPython(migrations.RunPython.noop, add_file_extensions),
migrations.RemoveField(
model_name="document",
name="file_type",
),
]

View File

@@ -0,0 +1,12 @@
# Generated by Django 3.1.3 on 2020-11-25 14:53
from django.db import migrations
from django.db.migrations import RunPython
class Migration(migrations.Migration):
dependencies = [
("documents", "1003_mime_types"),
]
operations = [RunPython(migrations.RunPython.noop, migrations.RunPython.noop)]

View File

@@ -0,0 +1,34 @@
# Generated by Django 3.1.3 on 2020-11-29 00:48
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1004_sanity_check_schedule"),
]
operations = [
migrations.AddField(
model_name="document",
name="archive_checksum",
field=models.CharField(
blank=True,
editable=False,
help_text="The checksum of the archived document.",
max_length=32,
null=True,
),
),
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=32,
unique=True,
),
),
]

View File

@@ -0,0 +1,24 @@
# Generated by Django 3.1.4 on 2020-12-08 22:09
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("documents", "1005_checksums"),
]
operations = [
migrations.RemoveField(
model_name="correspondent",
name="slug",
),
migrations.RemoveField(
model_name="documenttype",
name="slug",
),
migrations.RemoveField(
model_name="tag",
name="slug",
),
]

View File

@@ -0,0 +1,485 @@
# Generated by Django 4.2.13 on 2024-06-28 18:01
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("documents", "1006_auto_20201208_2209"),
("documents", "1007_savedview_savedviewfilterrule"),
("documents", "1008_auto_20201216_1736"),
("documents", "1009_auto_20201216_2005"),
("documents", "1010_auto_20210101_2159"),
("documents", "1011_auto_20210101_2340"),
]
dependencies = [
("documents", "1005_checksums"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.RemoveField(
model_name="correspondent",
name="slug",
),
migrations.RemoveField(
model_name="documenttype",
name="slug",
),
migrations.RemoveField(
model_name="tag",
name="slug",
),
migrations.CreateModel(
name="SavedView",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, verbose_name="name")),
(
"show_on_dashboard",
models.BooleanField(verbose_name="show on dashboard"),
),
(
"show_in_sidebar",
models.BooleanField(verbose_name="show in sidebar"),
),
(
"sort_field",
models.CharField(max_length=128, verbose_name="sort field"),
),
(
"sort_reverse",
models.BooleanField(default=False, verbose_name="sort reverse"),
),
(
"user",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
],
options={
"ordering": ("name",),
"verbose_name": "saved view",
"verbose_name_plural": "saved views",
},
),
migrations.AlterModelOptions(
name="correspondent",
options={
"ordering": ("name",),
"verbose_name": "correspondent",
"verbose_name_plural": "correspondents",
},
),
migrations.AlterModelOptions(
name="document",
options={
"ordering": ("-created",),
"verbose_name": "document",
"verbose_name_plural": "documents",
},
),
migrations.AlterModelOptions(
name="documenttype",
options={
"verbose_name": "document type",
"verbose_name_plural": "document types",
},
),
migrations.AlterModelOptions(
name="log",
options={
"ordering": ("-created",),
"verbose_name": "log",
"verbose_name_plural": "logs",
},
),
migrations.AlterModelOptions(
name="tag",
options={"verbose_name": "tag", "verbose_name_plural": "tags"},
),
migrations.AlterField(
model_name="correspondent",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name="correspondent",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="correspondent",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
migrations.AlterField(
model_name="document",
name="added",
field=models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
editable=False,
verbose_name="added",
),
),
migrations.AlterField(
model_name="document",
name="archive_checksum",
field=models.CharField(
blank=True,
editable=False,
help_text="The checksum of the archived document.",
max_length=32,
null=True,
verbose_name="archive checksum",
),
),
migrations.AlterField(
model_name="document",
name="archive_serial_number",
field=models.IntegerField(
blank=True,
db_index=True,
help_text="The position of this document in your physical document archive.",
null=True,
unique=True,
verbose_name="archive serial number",
),
),
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=32,
unique=True,
verbose_name="checksum",
),
),
migrations.AlterField(
model_name="document",
name="content",
field=models.TextField(
blank=True,
help_text="The raw, text-only data of the document. This field is primarily used for searching.",
verbose_name="content",
),
),
migrations.AlterField(
model_name="document",
name="correspondent",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.correspondent",
verbose_name="correspondent",
),
),
migrations.AlterField(
model_name="document",
name="created",
field=models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
verbose_name="created",
),
),
migrations.AlterField(
model_name="document",
name="document_type",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.documenttype",
verbose_name="document type",
),
),
migrations.AlterField(
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current filename in storage",
max_length=1024,
null=True,
verbose_name="filename",
),
),
migrations.AlterField(
model_name="document",
name="mime_type",
field=models.CharField(
editable=False,
max_length=256,
verbose_name="mime type",
),
),
migrations.AlterField(
model_name="document",
name="modified",
field=models.DateTimeField(
auto_now=True,
db_index=True,
verbose_name="modified",
),
),
migrations.AlterField(
model_name="document",
name="storage_type",
field=models.CharField(
choices=[
("unencrypted", "Unencrypted"),
("gpg", "Encrypted with GNU Privacy Guard"),
],
default="unencrypted",
editable=False,
max_length=11,
verbose_name="storage type",
),
),
migrations.AlterField(
model_name="document",
name="tags",
field=models.ManyToManyField(
blank=True,
related_name="documents",
to="documents.tag",
verbose_name="tags",
),
),
migrations.AlterField(
model_name="document",
name="title",
field=models.CharField(
blank=True,
db_index=True,
max_length=128,
verbose_name="title",
),
),
migrations.AlterField(
model_name="documenttype",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name="documenttype",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name="documenttype",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="documenttype",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
migrations.AlterField(
model_name="log",
name="created",
field=models.DateTimeField(auto_now_add=True, verbose_name="created"),
),
migrations.AlterField(
model_name="log",
name="group",
field=models.UUIDField(blank=True, null=True, verbose_name="group"),
),
migrations.AlterField(
model_name="log",
name="level",
field=models.PositiveIntegerField(
choices=[
(10, "debug"),
(20, "information"),
(30, "warning"),
(40, "error"),
(50, "critical"),
],
default=20,
verbose_name="level",
),
),
migrations.AlterField(
model_name="log",
name="message",
field=models.TextField(verbose_name="message"),
),
migrations.CreateModel(
name="SavedViewFilterRule",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"rule_type",
models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
],
verbose_name="rule type",
),
),
(
"value",
models.CharField(
blank=True,
max_length=128,
null=True,
verbose_name="value",
),
),
(
"saved_view",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="filter_rules",
to="documents.savedview",
verbose_name="saved view",
),
),
],
options={
"verbose_name": "filter rule",
"verbose_name_plural": "filter rules",
},
),
migrations.AlterField(
model_name="tag",
name="colour",
field=models.PositiveIntegerField(
choices=[
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc"),
],
default=1,
verbose_name="color",
),
),
migrations.AlterField(
model_name="tag",
name="is_inbox_tag",
field=models.BooleanField(
default=False,
help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.",
verbose_name="is inbox tag",
),
),
migrations.AlterField(
model_name="tag",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name="tag",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="tag",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
]

View File

@@ -0,0 +1,90 @@
# Generated by Django 3.1.4 on 2020-12-12 14:41
import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
("documents", "1006_auto_20201208_2209"),
]
operations = [
migrations.CreateModel(
name="SavedView",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128)),
("show_on_dashboard", models.BooleanField()),
("show_in_sidebar", models.BooleanField()),
("sort_field", models.CharField(max_length=128)),
("sort_reverse", models.BooleanField(default=False)),
(
"user",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
),
),
],
),
migrations.CreateModel(
name="SavedViewFilterRule",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"rule_type",
models.PositiveIntegerField(
choices=[
(0, "Title contains"),
(1, "Content contains"),
(2, "ASN is"),
(3, "Correspondent is"),
(4, "Document type is"),
(5, "Is in inbox"),
(6, "Has tag"),
(7, "Has any tag"),
(8, "Created before"),
(9, "Created after"),
(10, "Created year is"),
(11, "Created month is"),
(12, "Created day is"),
(13, "Added before"),
(14, "Added after"),
(15, "Modified before"),
(16, "Modified after"),
(17, "Does not have tag"),
],
),
),
("value", models.CharField(max_length=128)),
(
"saved_view",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="filter_rules",
to="documents.savedview",
),
),
],
),
]

View File

@@ -0,0 +1,33 @@
# Generated by Django 3.1.4 on 2020-12-16 17:36
import django.db.models.functions.text
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("documents", "1007_savedview_savedviewfilterrule"),
]
operations = [
migrations.AlterModelOptions(
name="correspondent",
options={"ordering": (django.db.models.functions.text.Lower("name"),)},
),
migrations.AlterModelOptions(
name="document",
options={"ordering": ("-created",)},
),
migrations.AlterModelOptions(
name="documenttype",
options={"ordering": (django.db.models.functions.text.Lower("name"),)},
),
migrations.AlterModelOptions(
name="savedview",
options={"ordering": (django.db.models.functions.text.Lower("name"),)},
),
migrations.AlterModelOptions(
name="tag",
options={"ordering": (django.db.models.functions.text.Lower("name"),)},
),
]

View File

@@ -0,0 +1,28 @@
# Generated by Django 3.1.4 on 2020-12-16 20:05
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("documents", "1008_auto_20201216_1736"),
]
operations = [
migrations.AlterModelOptions(
name="correspondent",
options={"ordering": ("name",)},
),
migrations.AlterModelOptions(
name="documenttype",
options={"ordering": ("name",)},
),
migrations.AlterModelOptions(
name="savedview",
options={"ordering": ("name",)},
),
migrations.AlterModelOptions(
name="tag",
options={"ordering": ("name",)},
),
]

View File

@@ -0,0 +1,18 @@
# Generated by Django 3.1.4 on 2021-01-01 21:59
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1009_auto_20201216_2005"),
]
operations = [
migrations.AlterField(
model_name="savedviewfilterrule",
name="value",
field=models.CharField(blank=True, max_length=128, null=True),
),
]

View File

@@ -0,0 +1,454 @@
# Generated by Django 3.1.4 on 2021-01-01 23:40
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
("documents", "1010_auto_20210101_2159"),
]
operations = [
migrations.AlterModelOptions(
name="correspondent",
options={
"ordering": ("name",),
"verbose_name": "correspondent",
"verbose_name_plural": "correspondents",
},
),
migrations.AlterModelOptions(
name="document",
options={
"ordering": ("-created",),
"verbose_name": "document",
"verbose_name_plural": "documents",
},
),
migrations.AlterModelOptions(
name="documenttype",
options={
"verbose_name": "document type",
"verbose_name_plural": "document types",
},
),
migrations.AlterModelOptions(
name="log",
options={
"ordering": ("-created",),
"verbose_name": "log",
"verbose_name_plural": "logs",
},
),
migrations.AlterModelOptions(
name="savedview",
options={
"ordering": ("name",),
"verbose_name": "saved view",
"verbose_name_plural": "saved views",
},
),
migrations.AlterModelOptions(
name="savedviewfilterrule",
options={
"verbose_name": "filter rule",
"verbose_name_plural": "filter rules",
},
),
migrations.AlterModelOptions(
name="tag",
options={"verbose_name": "tag", "verbose_name_plural": "tags"},
),
migrations.AlterField(
model_name="correspondent",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name="correspondent",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="correspondent",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
migrations.AlterField(
model_name="document",
name="added",
field=models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
editable=False,
verbose_name="added",
),
),
migrations.AlterField(
model_name="document",
name="archive_checksum",
field=models.CharField(
blank=True,
editable=False,
help_text="The checksum of the archived document.",
max_length=32,
null=True,
verbose_name="archive checksum",
),
),
migrations.AlterField(
model_name="document",
name="archive_serial_number",
field=models.IntegerField(
blank=True,
db_index=True,
help_text="The position of this document in your physical document archive.",
null=True,
unique=True,
verbose_name="archive serial number",
),
),
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=32,
unique=True,
verbose_name="checksum",
),
),
migrations.AlterField(
model_name="document",
name="content",
field=models.TextField(
blank=True,
help_text="The raw, text-only data of the document. This field is primarily used for searching.",
verbose_name="content",
),
),
migrations.AlterField(
model_name="document",
name="correspondent",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.correspondent",
verbose_name="correspondent",
),
),
migrations.AlterField(
model_name="document",
name="created",
field=models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
verbose_name="created",
),
),
migrations.AlterField(
model_name="document",
name="document_type",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.documenttype",
verbose_name="document type",
),
),
migrations.AlterField(
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current filename in storage",
max_length=1024,
null=True,
verbose_name="filename",
),
),
migrations.AlterField(
model_name="document",
name="mime_type",
field=models.CharField(
editable=False,
max_length=256,
verbose_name="mime type",
),
),
migrations.AlterField(
model_name="document",
name="modified",
field=models.DateTimeField(
auto_now=True,
db_index=True,
verbose_name="modified",
),
),
migrations.AlterField(
model_name="document",
name="storage_type",
field=models.CharField(
choices=[
("unencrypted", "Unencrypted"),
("gpg", "Encrypted with GNU Privacy Guard"),
],
default="unencrypted",
editable=False,
max_length=11,
verbose_name="storage type",
),
),
migrations.AlterField(
model_name="document",
name="tags",
field=models.ManyToManyField(
blank=True,
related_name="documents",
to="documents.Tag",
verbose_name="tags",
),
),
migrations.AlterField(
model_name="document",
name="title",
field=models.CharField(
blank=True,
db_index=True,
max_length=128,
verbose_name="title",
),
),
migrations.AlterField(
model_name="documenttype",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name="documenttype",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name="documenttype",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="documenttype",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
migrations.AlterField(
model_name="log",
name="created",
field=models.DateTimeField(auto_now_add=True, verbose_name="created"),
),
migrations.AlterField(
model_name="log",
name="group",
field=models.UUIDField(blank=True, null=True, verbose_name="group"),
),
migrations.AlterField(
model_name="log",
name="level",
field=models.PositiveIntegerField(
choices=[
(10, "debug"),
(20, "information"),
(30, "warning"),
(40, "error"),
(50, "critical"),
],
default=20,
verbose_name="level",
),
),
migrations.AlterField(
model_name="log",
name="message",
field=models.TextField(verbose_name="message"),
),
migrations.AlterField(
model_name="savedview",
name="name",
field=models.CharField(max_length=128, verbose_name="name"),
),
migrations.AlterField(
model_name="savedview",
name="show_in_sidebar",
field=models.BooleanField(verbose_name="show in sidebar"),
),
migrations.AlterField(
model_name="savedview",
name="show_on_dashboard",
field=models.BooleanField(verbose_name="show on dashboard"),
),
migrations.AlterField(
model_name="savedview",
name="sort_field",
field=models.CharField(max_length=128, verbose_name="sort field"),
),
migrations.AlterField(
model_name="savedview",
name="sort_reverse",
field=models.BooleanField(default=False, verbose_name="sort reverse"),
),
migrations.AlterField(
model_name="savedview",
name="user",
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
],
verbose_name="rule type",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="saved_view",
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="filter_rules",
to="documents.savedview",
verbose_name="saved view",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="value",
field=models.CharField(
blank=True,
max_length=128,
null=True,
verbose_name="value",
),
),
migrations.AlterField(
model_name="tag",
name="colour",
field=models.PositiveIntegerField(
choices=[
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc"),
],
default=1,
verbose_name="color",
),
),
migrations.AlterField(
model_name="tag",
name="is_inbox_tag",
field=models.BooleanField(
default=False,
help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.",
verbose_name="is inbox tag",
),
),
migrations.AlterField(
model_name="tag",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name="tag",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="tag",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
]

View File

@@ -0,0 +1,367 @@
# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
from collections import defaultdict
from pathlib import Path
from time import sleep
import pathvalidate
from django.conf import settings
from django.db import migrations
from django.db import models
from django.template.defaultfilters import slugify
logger = logging.getLogger("paperless.migrations")
###############################################################################
# This is code copied straight from paperless before the change.
###############################################################################
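# defaultdict that refuses plain string conversion, so a bare {tags} in the
# filename format fails loudly instead of dumping the whole dictionary.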
class defaultdictNoStr(defaultdict):
def __str__(self): # pragma: no cover
raise ValueError("Don't use {tags} directly.")
def many_to_dictionary(field): # pragma: no cover
# Converts ManyToManyField to dictionary by assuming that field
# entries contain an _ or - which will be used as a delimiter
mydictionary = dict()
for index, t in enumerate(field.all()):
# Populate tag names by index
mydictionary[index] = slugify(t.name)
# Find delimiter
delimiter = t.name.find("_")
if delimiter == -1:
delimiter = t.name.find("-")
if delimiter == -1:
continue
key = t.name[:delimiter]
value = t.name[delimiter + 1 :]
mydictionary[slugify(key)] = slugify(value)
return mydictionary
def archive_name_from_filename(filename: Path) -> Path:
return Path(filename.stem + ".pdf")
def archive_path_old(doc) -> Path:
if doc.filename:
fname = archive_name_from_filename(Path(doc.filename))
else:
fname = Path(f"{doc.pk:07}.pdf")
return settings.ARCHIVE_DIR / fname
STORAGE_TYPE_GPG = "gpg"
def archive_path_new(doc) -> Path | None:
if doc.archive_filename is not None:
return settings.ARCHIVE_DIR / doc.archive_filename
else:
return None
def source_path(doc) -> Path:
if doc.filename:
fname = doc.filename
else:
fname = f"{doc.pk:07}{doc.file_type}"
if doc.storage_type == STORAGE_TYPE_GPG:
fname = Path(str(fname) + ".gpg") # pragma: no cover
return settings.ORIGINALS_DIR / fname
def generate_unique_filename(doc, *, archive_filename=False):
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
counter = 0
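# Keep bumping the counter until the generated name is either unchanged
# (no rename needed) or does not collide with an existing file on disk.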
while True:
new_filename = generate_filename(
doc,
counter=counter,
archive_filename=archive_filename,
)
if new_filename == old_filename:
# still the same as before.
return new_filename
if (root / new_filename).exists():
counter += 1
else:
return new_filename
def generate_filename(doc, *, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
if settings.FILENAME_FORMAT is not None:
tags = defaultdictNoStr(lambda: slugify(None), many_to_dictionary(doc.tags))
tag_list = pathvalidate.sanitize_filename(
",".join(sorted([tag.name for tag in doc.tags.all()])),
replacement_text="-",
)
if doc.correspondent:
correspondent = pathvalidate.sanitize_filename(
doc.correspondent.name,
replacement_text="-",
)
else:
correspondent = "none"
if doc.document_type:
document_type = pathvalidate.sanitize_filename(
doc.document_type.name,
replacement_text="-",
)
else:
document_type = "none"
path = settings.FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(doc.created),
created_year=doc.created.year if doc.created else "none",
created_month=f"{doc.created.month:02}" if doc.created else "none",
created_day=f"{doc.created.day:02}" if doc.created else "none",
added=datetime.date.isoformat(doc.added),
added_year=doc.added.year if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
tags=tags,
tag_list=tag_list,
).strip()
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.FILENAME_FORMAT}, falling back to default",
)
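# Assemble the final name: optional collision counter, the proper file
# extension, and a .gpg suffix for encrypted originals.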
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
###############################################################################
# This code performs bidirectional archive file transformation.
###############################################################################
def parse_wrapper(parser, path, mime_type, file_name):
# this is here so that I can mock this out for testing.
parser.parse(path, mime_type, file_name)
def create_archive_version(doc, retry_count=3):
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import get_parser_class_for_mime_type
logger.info(f"Regenerating archive document for document ID:{doc.id}")
parser_class = get_parser_class_for_mime_type(doc.mime_type)
for try_num in range(retry_count):
parser: DocumentParser = parser_class(None, None)
try:
parse_wrapper(
parser,
source_path(doc),
doc.mime_type,
Path(doc.filename).name,
)
doc.content = parser.get_text()
if parser.get_archive_path() and Path(parser.get_archive_path()).is_file():
doc.archive_filename = generate_unique_filename(
doc,
archive_filename=True,
)
with Path(parser.get_archive_path()).open("rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
archive_path_new(doc).parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
else:
doc.archive_checksum = None
logger.error(
f"Parser did not return an archive document for document "
f"ID:{doc.id}. Removing archive document.",
)
doc.save()
return
except ParseError:
if try_num + 1 == retry_count:
logger.exception(
f"Unable to regenerate archive document for ID:{doc.id}. You "
f"need to invoke the document_archiver management command "
f"manually for that document.",
)
doc.archive_checksum = None
doc.save()
return
else:
# This is mostly here for the tika parser in docker
# environments. The servers for parsing need to come up first,
# and the docker setup doesn't ensure that tika is running
# before attempting migrations.
logger.error("Parse error, will try again in 5 seconds...")
sleep(5)
finally:
parser.cleanup()
def move_old_to_new_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
affected_document_ids = set()
old_archive_path_to_id = {}
# check for documents that have incorrect archive versions
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if old_path in old_archive_path_to_id:
affected_document_ids.add(doc.id)
affected_document_ids.add(old_archive_path_to_id[old_path])
else:
old_archive_path_to_id[old_path] = doc.id
# check that archive files of all unaffected documents are in place
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if doc.id not in affected_document_ids and not old_path.is_file():
raise ValueError(
f"Archived document ID:{doc.id} does not exist at: {old_path}",
)
# check that we can regenerate affected archive versions
for doc_id in affected_document_ids:
from documents.parsers import get_parser_class_for_mime_type
doc = Document.objects.get(id=doc_id)
parser_class = get_parser_class_for_mime_type(doc.mime_type)
if not parser_class:
raise ValueError(
f"Document ID:{doc.id} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.",
)
for doc in Document.objects.filter(archive_checksum__isnull=False):
if doc.id in affected_document_ids:
old_path = archive_path_old(doc)
# remove affected archive versions
if old_path.is_file():
logger.debug(f"Removing {old_path}")
old_path.unlink()
else:
# Set archive path for unaffected files
doc.archive_filename = archive_name_from_filename(Path(doc.filename))
Document.objects.filter(id=doc.id).update(
archive_filename=doc.archive_filename,
)
# regenerate archive documents
for doc_id in affected_document_ids:
doc = Document.objects.get(id=doc_id)
create_archive_version(doc)
def move_new_to_old_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
old_archive_paths = set()
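# First pass: make sure moving archive files back to their legacy names
# would neither clash with each other nor overwrite an existing file.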
for doc in Document.objects.filter(archive_checksum__isnull=False):
new_archive_path = archive_path_new(doc)
old_archive_path = archive_path_old(doc)
if old_archive_path in old_archive_paths:
raise ValueError(
f"Cannot migrate: Archive file name {old_archive_path} of "
f"document {doc.filename} would clash with another archive "
f"filename.",
)
old_archive_paths.add(old_archive_path)
if new_archive_path != old_archive_path and old_archive_path.is_file():
raise ValueError(
f"Cannot migrate: Cannot move {new_archive_path} to "
f"{old_archive_path}: file already exists.",
)
for doc in Document.objects.filter(archive_checksum__isnull=False):
new_archive_path = archive_path_new(doc)
old_archive_path = archive_path_old(doc)
if new_archive_path != old_archive_path:
logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
shutil.move(new_archive_path, old_archive_path)
class Migration(migrations.Migration):
dependencies = [
("documents", "1011_auto_20210101_2340"),
]
operations = [
migrations.AddField(
model_name="document",
name="archive_filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current archive filename in storage",
max_length=1024,
null=True,
unique=True,
verbose_name="archive filename",
),
),
migrations.AlterField(
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current filename in storage",
max_length=1024,
null=True,
unique=True,
verbose_name="filename",
),
),
migrations.RunPython(move_old_to_new_locations, move_new_to_old_locations),
]

View File

@@ -0,0 +1,74 @@
# Generated by Django 3.1.4 on 2020-12-02 21:43
from django.db import migrations
from django.db import models
COLOURS_OLD = {
1: "#a6cee3",
2: "#1f78b4",
3: "#b2df8a",
4: "#33a02c",
5: "#fb9a99",
6: "#e31a1c",
7: "#fdbf6f",
8: "#ff7f00",
9: "#cab2d6",
10: "#6a3d9a",
11: "#b15928",
12: "#000000",
13: "#cccccc",
}
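# Copy the legacy integer colour choice onto the new free-form hex color field.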
def forward(apps, schema_editor):
Tag = apps.get_model("documents", "Tag")
for tag in Tag.objects.all():
colour_old_id = tag.colour_old
rgb = COLOURS_OLD[colour_old_id]
tag.color = rgb
tag.save()
def reverse(apps, schema_editor):
Tag = apps.get_model("documents", "Tag")
def _get_colour_id(rdb):
for idx, rdbx in COLOURS_OLD.items():
if rdbx == rdb:
return idx
# Return colour 1 if we can't match anything
return 1
for tag in Tag.objects.all():
colour_id = _get_colour_id(tag.color)
tag.colour_old = colour_id
tag.save()
class Migration(migrations.Migration):
dependencies = [
("documents", "1012_fix_archive_files"),
]
operations = [
migrations.RenameField(
model_name="tag",
old_name="colour",
new_name="colour_old",
),
migrations.AddField(
model_name="tag",
name="color",
field=models.CharField(
default="#a6cee3",
max_length=7,
verbose_name="color",
),
),
migrations.RunPython(forward, reverse),
migrations.RemoveField(
model_name="tag",
name="colour_old",
),
]

View File

@@ -0,0 +1,42 @@
# Generated by Django 3.1.7 on 2021-02-28 15:14
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1013_migrate_tag_colour"),
]
operations = [
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
],
verbose_name="rule type",
),
),
]

View File

@@ -0,0 +1,27 @@
# Generated by Django 3.1.7 on 2021-04-04 18:28
import logging
from django.db import migrations
logger = logging.getLogger("paperless.migrations")
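# Null bytes in extracted content can break some database backends
# (notably PostgreSQL text columns), so replace them with spaces.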
def remove_null_characters(apps, schema_editor):
Document = apps.get_model("documents", "Document")
for doc in Document.objects.all():
content: str = doc.content
if "\0" in content:
logger.info(f"Removing null characters from document {doc}...")
doc.content = content.replace("\0", " ")
doc.save()
class Migration(migrations.Migration):
dependencies = [
("documents", "1014_auto_20210228_1614"),
]
operations = [
migrations.RunPython(remove_null_characters, migrations.RunPython.noop),
]

View File

@@ -0,0 +1,54 @@
# Generated by Django 3.1.7 on 2021-03-17 12:51
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1015_remove_null_characters"),
]
operations = [
migrations.AlterField(
model_name="savedview",
name="sort_field",
field=models.CharField(
blank=True,
max_length=128,
null=True,
verbose_name="sort field",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
],
verbose_name="rule type",
),
),
]

View File

@@ -0,0 +1,190 @@
# Generated by Django 4.2.13 on 2024-06-28 18:09
import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("documents", "1016_auto_20210317_1351"),
("documents", "1017_alter_savedviewfilterrule_rule_type"),
("documents", "1018_alter_savedviewfilterrule_value"),
("documents", "1019_uisettings"),
("documents", "1019_storagepath_document_storage_path"),
("documents", "1020_merge_20220518_1839"),
]
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
("documents", "1015_remove_null_characters"),
]
operations = [
migrations.AlterField(
model_name="savedview",
name="sort_field",
field=models.CharField(
blank=True,
max_length=128,
null=True,
verbose_name="sort field",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
],
verbose_name="rule type",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
],
verbose_name="rule type",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="value",
field=models.CharField(
blank=True,
max_length=255,
null=True,
verbose_name="value",
),
),
migrations.CreateModel(
name="UiSettings",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("settings", models.JSONField(null=True)),
(
"user",
models.OneToOneField(
on_delete=django.db.models.deletion.CASCADE,
related_name="ui_settings",
to=settings.AUTH_USER_MODEL,
),
),
],
),
migrations.CreateModel(
name="StoragePath",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"name",
models.CharField(max_length=128, unique=True, verbose_name="name"),
),
(
"match",
models.CharField(blank=True, max_length=256, verbose_name="match"),
),
(
"matching_algorithm",
models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
(
"is_insensitive",
models.BooleanField(default=True, verbose_name="is insensitive"),
),
("path", models.CharField(max_length=512, verbose_name="path")),
],
options={
"verbose_name": "storage path",
"verbose_name_plural": "storage paths",
"ordering": ("name",),
},
),
migrations.AddField(
model_name="document",
name="storage_path",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.storagepath",
verbose_name="storage path",
),
),
]

View File

@@ -0,0 +1,45 @@
# Generated by Django 3.2.12 on 2022-03-17 11:59
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1016_auto_20210317_1351"),
]
operations = [
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
],
verbose_name="rule type",
),
),
]

View File

@@ -0,0 +1,23 @@
# Generated by Django 4.0.3 on 2022-04-01 22:50
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1017_alter_savedviewfilterrule_rule_type"),
]
operations = [
migrations.AlterField(
model_name="savedviewfilterrule",
name="value",
field=models.CharField(
blank=True,
max_length=255,
null=True,
verbose_name="value",
),
),
]

View File

@@ -0,0 +1,73 @@
# Generated by Django 4.0.4 on 2022-05-02 15:56
import django.db.models.deletion
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1018_alter_savedviewfilterrule_value"),
]
operations = [
migrations.CreateModel(
name="StoragePath",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"name",
models.CharField(max_length=128, unique=True, verbose_name="name"),
),
(
"match",
models.CharField(blank=True, max_length=256, verbose_name="match"),
),
(
"matching_algorithm",
models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
(
"is_insensitive",
models.BooleanField(default=True, verbose_name="is insensitive"),
),
("path", models.CharField(max_length=512, verbose_name="path")),
],
options={
"verbose_name": "storage path",
"verbose_name_plural": "storage paths",
"ordering": ("name",),
},
),
migrations.AddField(
model_name="document",
name="storage_path",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.storagepath",
verbose_name="storage path",
),
),
]

View File

@@ -0,0 +1,39 @@
# Generated by Django 4.0.4 on 2022-05-07 05:10
import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
("documents", "1018_alter_savedviewfilterrule_value"),
]
operations = [
migrations.CreateModel(
name="UiSettings",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("settings", models.JSONField(null=True)),
(
"user",
models.OneToOneField(
on_delete=django.db.models.deletion.CASCADE,
related_name="ui_settings",
to=settings.AUTH_USER_MODEL,
),
),
],
),
]

View File

@@ -0,0 +1,12 @@
# Generated by Django 4.0.4 on 2022-05-18 18:39
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("documents", "1019_storagepath_document_storage_path"),
("documents", "1019_uisettings"),
]
operations = []

View File

@@ -0,0 +1,104 @@
# Generated by Django 4.0.5 on 2022-06-11 15:40
import logging
import multiprocessing.pool
import shutil
import tempfile
import time
from pathlib import Path
from django.conf import settings
from django.db import migrations
from documents.parsers import run_convert
logger = logging.getLogger("paperless.migrations")
def _do_convert(work_package):
existing_thumbnail, converted_thumbnail = work_package
try:
logger.info(f"Converting thumbnail: {existing_thumbnail}")
# Run actual conversion
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{existing_thumbnail}[0]",
output_file=str(converted_thumbnail),
)
# Copy newly created thumbnail to thumbnail directory
shutil.copy(converted_thumbnail, existing_thumbnail.parent)
# Remove the PNG version
existing_thumbnail.unlink()
logger.info(
"Conversion to WebP completed, "
f"replaced {existing_thumbnail.name} with {converted_thumbnail.name}",
)
except Exception as e:
logger.error(f"Error converting thumbnail (existing file unchanged): {e}")
def _convert_thumbnails_to_webp(apps, schema_editor):
start = time.time()
with tempfile.TemporaryDirectory() as tempdir:
work_packages = []
for file in Path(settings.THUMBNAIL_DIR).glob("*.png"):
existing_thumbnail = file.resolve()
# Change the existing filename suffix from png to webp
converted_thumbnail_name = existing_thumbnail.with_suffix(
".webp",
).name
# Create the expected output filename in the tempdir
converted_thumbnail = (
Path(tempdir) / Path(converted_thumbnail_name)
).resolve()
# Package up the necessary info
work_packages.append(
(existing_thumbnail, converted_thumbnail),
)
if work_packages:
logger.info(
"\n\n"
" This is a one-time only migration to convert thumbnails for all of your\n"
" documents into WebP format. If you have a lot of documents though, \n"
" this may take a while, so a coffee break may be in order."
"\n",
)
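# Convert in parallel with at most four worker processes, each recycled
# after four conversions.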
with multiprocessing.pool.Pool(
processes=min(multiprocessing.cpu_count(), 4),
maxtasksperchild=4,
) as pool:
pool.map(_do_convert, work_packages)
end = time.time()
duration = end - start
logger.info(f"Conversion completed in {duration:.3f}s")
class Migration(migrations.Migration):
dependencies = [
("documents", "1020_merge_20220518_1839"),
]
operations = [
migrations.RunPython(
code=_convert_thumbnails_to_webp,
reverse_code=migrations.RunPython.noop,
),
]

View File

@@ -0,0 +1,52 @@
# Generated by Django 4.0.4 on 2022-05-23 07:14
import django.db.models.deletion
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1021_webp_thumbnail_conversion"),
]
operations = [
migrations.CreateModel(
name="PaperlessTask",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("task_id", models.CharField(max_length=128)),
("name", models.CharField(max_length=256, null=True)),
(
"created",
models.DateTimeField(auto_now=True, verbose_name="created"),
),
(
"started",
models.DateTimeField(null=True, verbose_name="started"),
),
("acknowledged", models.BooleanField(default=False)),
(
"attempted_task",
models.OneToOneField(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="attempted_task",
# This is a dummy field; migration 1026 will fix up the column.
# This manual change is required, as django doesn't really support
# removing an app which has migration deps like this
to="documents.document",
),
),
],
),
]

View File

@@ -0,0 +1,668 @@
# Generated by Django 4.2.13 on 2024-06-28 18:10
import django.core.validators
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("documents", "1022_paperlesstask"),
("documents", "1023_add_comments"),
("documents", "1024_document_original_filename"),
("documents", "1025_alter_savedviewfilterrule_rule_type"),
("documents", "1026_transition_to_celery"),
("documents", "1027_remove_paperlesstask_attempted_task_and_more"),
("documents", "1028_remove_paperlesstask_task_args_and_more"),
("documents", "1029_alter_document_archive_serial_number"),
("documents", "1030_alter_paperlesstask_task_file_name"),
("documents", "1031_remove_savedview_user_correspondent_owner_and_more"),
("documents", "1032_alter_correspondent_matching_algorithm_and_more"),
("documents", "1033_alter_documenttype_options_alter_tag_options_and_more"),
("documents", "1034_alter_savedviewfilterrule_rule_type"),
("documents", "1035_rename_comment_note"),
("documents", "1036_alter_savedviewfilterrule_rule_type"),
]
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
("django_celery_results", "0011_taskresult_periodic_task_name"),
("documents", "1021_webp_thumbnail_conversion"),
]
operations = [
migrations.CreateModel(
name="Comment",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"comment",
models.TextField(
blank=True,
help_text="Comment for the document",
verbose_name="content",
),
),
(
"created",
models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
verbose_name="created",
),
),
(
"document",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="documents",
to="documents.document",
verbose_name="document",
),
),
(
"user",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="users",
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
],
options={
"verbose_name": "comment",
"verbose_name_plural": "comments",
"ordering": ("created",),
},
),
migrations.AddField(
model_name="document",
name="original_filename",
field=models.CharField(
default=None,
editable=False,
help_text="The original name of the file when it was uploaded",
max_length=1024,
null=True,
verbose_name="original filename",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
(23, "ASN greater than"),
(24, "ASN less than"),
(25, "storage path is"),
],
verbose_name="rule type",
),
),
migrations.CreateModel(
name="PaperlessTask",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("task_id", models.CharField(max_length=128)),
("acknowledged", models.BooleanField(default=False)),
(
"attempted_task",
models.OneToOneField(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="attempted_task",
to="django_celery_results.taskresult",
),
),
],
),
migrations.RunSQL(
sql="DROP TABLE IF EXISTS django_q_ormq",
reverse_sql="",
),
migrations.RunSQL(
sql="DROP TABLE IF EXISTS django_q_schedule",
reverse_sql="",
),
migrations.RunSQL(
sql="DROP TABLE IF EXISTS django_q_task",
reverse_sql="",
),
migrations.RemoveField(
model_name="paperlesstask",
name="attempted_task",
),
migrations.AddField(
model_name="paperlesstask",
name="date_created",
field=models.DateTimeField(
default=django.utils.timezone.now,
help_text="Datetime field when the task result was created in UTC",
null=True,
verbose_name="Created DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="date_done",
field=models.DateTimeField(
default=None,
help_text="Datetime field when the task was completed in UTC",
null=True,
verbose_name="Completed DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="date_started",
field=models.DateTimeField(
default=None,
help_text="Datetime field when the task was started in UTC",
null=True,
verbose_name="Started DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="result",
field=models.TextField(
default=None,
help_text="The data returned by the task",
null=True,
verbose_name="Result Data",
),
),
migrations.AddField(
model_name="paperlesstask",
name="status",
field=models.CharField(
choices=[
("FAILURE", "FAILURE"),
("PENDING", "PENDING"),
("RECEIVED", "RECEIVED"),
("RETRY", "RETRY"),
("REVOKED", "REVOKED"),
("STARTED", "STARTED"),
("SUCCESS", "SUCCESS"),
],
default="PENDING",
help_text="Current state of the task being run",
max_length=30,
verbose_name="Task State",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_name",
field=models.CharField(
help_text="Name of the Task which was run",
max_length=255,
null=True,
verbose_name="Task Name",
),
),
migrations.AlterField(
model_name="paperlesstask",
name="acknowledged",
field=models.BooleanField(
default=False,
help_text="If the task is acknowledged via the frontend or API",
verbose_name="Acknowledged",
),
),
migrations.AlterField(
model_name="paperlesstask",
name="task_id",
field=models.CharField(
help_text="Celery ID for the Task that was run",
max_length=255,
unique=True,
verbose_name="Task ID",
),
),
migrations.AlterField(
model_name="document",
name="archive_serial_number",
field=models.PositiveIntegerField(
blank=True,
db_index=True,
help_text="The position of this document in your physical document archive.",
null=True,
unique=True,
validators=[
django.core.validators.MaxValueValidator(4294967295),
django.core.validators.MinValueValidator(0),
],
verbose_name="archive serial number",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_file_name",
field=models.CharField(
help_text="Name of the file which the Task was run for",
max_length=255,
null=True,
verbose_name="Task Filename",
),
),
migrations.RenameField(
model_name="savedview",
old_name="user",
new_name="owner",
),
migrations.AlterField(
model_name="savedview",
name="owner",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
verbose_name="owner",
),
),
migrations.AddField(
model_name="correspondent",
name="owner",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
verbose_name="owner",
),
),
migrations.AddField(
model_name="document",
name="owner",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
verbose_name="owner",
),
),
migrations.AddField(
model_name="documenttype",
name="owner",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
verbose_name="owner",
),
),
migrations.AddField(
model_name="storagepath",
name="owner",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
verbose_name="owner",
),
),
migrations.AddField(
model_name="tag",
name="owner",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
verbose_name="owner",
),
),
migrations.AlterField(
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="documenttype",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="storagepath",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterModelOptions(
name="documenttype",
options={
"ordering": ("name",),
"verbose_name": "document type",
"verbose_name_plural": "document types",
},
),
migrations.AlterModelOptions(
name="tag",
options={
"ordering": ("name",),
"verbose_name": "tag",
"verbose_name_plural": "tags",
},
),
migrations.AlterField(
model_name="correspondent",
name="name",
field=models.CharField(max_length=128, verbose_name="name"),
),
migrations.AlterField(
model_name="documenttype",
name="name",
field=models.CharField(max_length=128, verbose_name="name"),
),
migrations.AlterField(
model_name="storagepath",
name="name",
field=models.CharField(max_length=128, verbose_name="name"),
),
migrations.AlterField(
model_name="tag",
name="name",
field=models.CharField(max_length=128, verbose_name="name"),
),
migrations.AddConstraint(
model_name="correspondent",
constraint=models.UniqueConstraint(
fields=("name", "owner"),
name="documents_correspondent_unique_name_owner",
),
),
migrations.AddConstraint(
model_name="correspondent",
constraint=models.UniqueConstraint(
condition=models.Q(("owner__isnull", True)),
fields=("name",),
name="documents_correspondent_name_uniq",
),
),
migrations.AddConstraint(
model_name="documenttype",
constraint=models.UniqueConstraint(
fields=("name", "owner"),
name="documents_documenttype_unique_name_owner",
),
),
migrations.AddConstraint(
model_name="documenttype",
constraint=models.UniqueConstraint(
condition=models.Q(("owner__isnull", True)),
fields=("name",),
name="documents_documenttype_name_uniq",
),
),
migrations.AddConstraint(
model_name="storagepath",
constraint=models.UniqueConstraint(
fields=("name", "owner"),
name="documents_storagepath_unique_name_owner",
),
),
migrations.AddConstraint(
model_name="storagepath",
constraint=models.UniqueConstraint(
condition=models.Q(("owner__isnull", True)),
fields=("name",),
name="documents_storagepath_name_uniq",
),
),
migrations.AddConstraint(
model_name="tag",
constraint=models.UniqueConstraint(
fields=("name", "owner"),
name="documents_tag_unique_name_owner",
),
),
migrations.AddConstraint(
model_name="tag",
constraint=models.UniqueConstraint(
condition=models.Q(("owner__isnull", True)),
fields=("name",),
name="documents_tag_name_uniq",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
(23, "ASN greater than"),
(24, "ASN less than"),
(25, "storage path is"),
(26, "has correspondent in"),
(27, "does not have correspondent in"),
(28, "has document type in"),
(29, "does not have document type in"),
(30, "has storage path in"),
(31, "does not have storage path in"),
],
verbose_name="rule type",
),
),
migrations.RenameModel(
old_name="Comment",
new_name="Note",
),
migrations.RenameField(
model_name="note",
old_name="comment",
new_name="note",
),
migrations.AlterModelOptions(
name="note",
options={
"ordering": ("created",),
"verbose_name": "note",
"verbose_name_plural": "notes",
},
),
migrations.AlterField(
model_name="note",
name="document",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="notes",
to="documents.document",
verbose_name="document",
),
),
migrations.AlterField(
model_name="note",
name="note",
field=models.TextField(
blank=True,
help_text="Note for the document",
verbose_name="content",
),
),
migrations.AlterField(
model_name="note",
name="user",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="notes",
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
(23, "ASN greater than"),
(24, "ASN less than"),
(25, "storage path is"),
(26, "has correspondent in"),
(27, "does not have correspondent in"),
(28, "has document type in"),
(29, "does not have document type in"),
(30, "has storage path in"),
(31, "does not have storage path in"),
(32, "owner is"),
(33, "has owner in"),
(34, "does not have owner"),
(35, "does not have owner in"),
],
verbose_name="rule type",
),
),
]

View File

@@ -0,0 +1,70 @@
import django.utils.timezone
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1022_paperlesstask"),
]
operations = [
migrations.CreateModel(
name="Comment",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"comment",
models.TextField(
blank=True,
help_text="Comment for the document",
verbose_name="content",
),
),
(
"created",
models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
verbose_name="created",
),
),
(
"document",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="documents",
to="documents.document",
verbose_name="document",
),
),
(
"user",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="users",
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
],
options={
"verbose_name": "comment",
"verbose_name_plural": "comments",
"ordering": ("created",),
},
),
]

View File

@@ -0,0 +1,25 @@
# Generated by Django 4.0.6 on 2022-07-25 06:34
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1023_add_comments"),
]
operations = [
migrations.AddField(
model_name="document",
name="original_filename",
field=models.CharField(
default=None,
editable=False,
help_text="The original name of the file when it was uploaded",
max_length=1024,
null=True,
verbose_name="original filename",
),
),
]

View File

@@ -0,0 +1,48 @@
# Generated by Django 4.0.5 on 2022-08-26 16:49
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1024_document_original_filename"),
]
operations = [
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
(23, "ASN greater than"),
(24, "ASN less than"),
(25, "storage path is"),
],
verbose_name="rule type",
),
),
]

View File

@@ -0,0 +1,60 @@
# Generated by Django 4.1.1 on 2022-09-27 19:31
import django.db.models.deletion
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("django_celery_results", "0011_taskresult_periodic_task_name"),
("documents", "1025_alter_savedviewfilterrule_rule_type"),
]
operations = [
migrations.RemoveField(
model_name="paperlesstask",
name="created",
),
migrations.RemoveField(
model_name="paperlesstask",
name="name",
),
migrations.RemoveField(
model_name="paperlesstask",
name="started",
),
# Remove the field from the model
migrations.RemoveField(
model_name="paperlesstask",
name="attempted_task",
),
# Add the field back, pointing to the correct model
# This resolves a problem where the temporary change in 1022
# results in a type mismatch
migrations.AddField(
model_name="paperlesstask",
name="attempted_task",
field=models.OneToOneField(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="attempted_task",
to="django_celery_results.taskresult",
),
),
# Drop the django-q tables entirely
# Must be done last or there could be references here
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_ormq",
reverse_sql=migrations.RunSQL.noop,
),
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_schedule",
reverse_sql=migrations.RunSQL.noop,
),
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_task",
reverse_sql=migrations.RunSQL.noop,
),
]

View File

@@ -0,0 +1,134 @@
# Generated by Django 4.1.2 on 2022-10-17 16:31
import django.utils.timezone
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1026_transition_to_celery"),
]
operations = [
migrations.RemoveField(
model_name="paperlesstask",
name="attempted_task",
),
migrations.AddField(
model_name="paperlesstask",
name="date_created",
field=models.DateTimeField(
default=django.utils.timezone.now,
help_text="Datetime field when the task result was created in UTC",
null=True,
verbose_name="Created DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="date_done",
field=models.DateTimeField(
default=None,
help_text="Datetime field when the task was completed in UTC",
null=True,
verbose_name="Completed DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="date_started",
field=models.DateTimeField(
default=None,
help_text="Datetime field when the task was started in UTC",
null=True,
verbose_name="Started DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="result",
field=models.TextField(
default=None,
help_text="The data returned by the task",
null=True,
verbose_name="Result Data",
),
),
migrations.AddField(
model_name="paperlesstask",
name="status",
field=models.CharField(
choices=[
("FAILURE", "FAILURE"),
("PENDING", "PENDING"),
("RECEIVED", "RECEIVED"),
("RETRY", "RETRY"),
("REVOKED", "REVOKED"),
("STARTED", "STARTED"),
("SUCCESS", "SUCCESS"),
],
default="PENDING",
help_text="Current state of the task being run",
max_length=30,
verbose_name="Task State",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_args",
field=models.JSONField(
help_text="JSON representation of the positional arguments used with the task",
null=True,
verbose_name="Task Positional Arguments",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_file_name",
field=models.CharField(
help_text="Name of the file which the Task was run for",
max_length=255,
null=True,
verbose_name="Task Name",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_kwargs",
field=models.JSONField(
help_text="JSON representation of the named arguments used with the task",
null=True,
verbose_name="Task Named Arguments",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_name",
field=models.CharField(
help_text="Name of the Task which was run",
max_length=255,
null=True,
verbose_name="Task Name",
),
),
migrations.AlterField(
model_name="paperlesstask",
name="acknowledged",
field=models.BooleanField(
default=False,
help_text="If the task is acknowledged via the frontend or API",
verbose_name="Acknowledged",
),
),
migrations.AlterField(
model_name="paperlesstask",
name="task_id",
field=models.CharField(
help_text="Celery ID for the Task that was run",
max_length=255,
unique=True,
verbose_name="Task ID",
),
),
]
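
The fields added above track the Celery task lifecycle directly on PaperlessTask. As a rough illustration of how they can be queried (not code from this commit), the sketch below lists unacknowledged failures using only fields defined in this migration:

from documents.models import PaperlessTask

# Unacknowledged failed tasks, most recently finished first
failed_tasks = PaperlessTask.objects.filter(
    status="FAILURE",
    acknowledged=False,
).order_by("-date_done")

for task in failed_tasks:
    print(task.task_file_name, task.result)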

View File

@@ -0,0 +1,20 @@
# Generated by Django 4.1.3 on 2022-11-22 17:50

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1027_remove_paperlesstask_attempted_task_and_more"),
    ]

    operations = [
migrations.RemoveField(
model_name="paperlesstask",
name="task_args",
),
migrations.RemoveField(
model_name="paperlesstask",
name="task_kwargs",
),
]

View File

@@ -0,0 +1,30 @@
# Generated by Django 4.1.4 on 2023-01-24 17:56

import django.core.validators
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1028_remove_paperlesstask_task_args_and_more"),
    ]

    operations = [
migrations.AlterField(
model_name="document",
name="archive_serial_number",
field=models.PositiveIntegerField(
blank=True,
db_index=True,
help_text="The position of this document in your physical document archive.",
null=True,
unique=True,
validators=[
django.core.validators.MaxValueValidator(4294967295),
django.core.validators.MinValueValidator(0),
],
verbose_name="archive serial number",
),
),
]
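
Note that the Min/Max validators above are only enforced when full_clean() (or a ModelForm) runs, not by save() itself; 4294967295 is the unsigned 32-bit maximum. A minimal sketch, assuming an existing Document instance:

from django.core.exceptions import ValidationError

from documents.models import Document

doc = Document.objects.first()
doc.archive_serial_number = 4_294_967_296  # one past the allowed maximum
try:
    doc.full_clean()  # validators run here, not on save()
except ValidationError as exc:
    print(exc.message_dict["archive_serial_number"])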

View File

@@ -0,0 +1,23 @@
# Generated by Django 4.1.5 on 2023-02-03 21:53

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1029_alter_document_archive_serial_number"),
    ]

    operations = [
migrations.AlterField(
model_name="paperlesstask",
name="task_file_name",
field=models.CharField(
help_text="Name of the file which the Task was run for",
max_length=255,
null=True,
verbose_name="Task Filename",
),
),
]

Some files were not shown because too many files have changed in this diff.