Mirror of https://github.com/idrainformatica/Linfa.git, synced 2026-04-19 16:53:47 +02:00
v1
src/documents/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
# this is here so that django finds the checks.
from documents.checks import changed_password_check
from documents.checks import parser_check

__all__ = ["changed_password_check", "parser_check"]
src/documents/admin.py (new file, 229 lines)
@@ -0,0 +1,229 @@
from django.conf import settings
from django.contrib import admin
from guardian.admin import GuardedModelAdmin
from treenode.admin import TreeNodeModelAdmin

from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import SavedViewFilterRule
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
from documents.tasks import update_document_parent_tags

if settings.AUDIT_LOG_ENABLED:
    from auditlog.admin import LogEntryAdmin
    from auditlog.models import LogEntry


class CorrespondentAdmin(GuardedModelAdmin):
    list_display = ("name", "match", "matching_algorithm")
    list_filter = ("matching_algorithm",)
    list_editable = ("match", "matching_algorithm")


class TagAdmin(GuardedModelAdmin, TreeNodeModelAdmin):
    list_display = ("name", "color", "match", "matching_algorithm")
    list_filter = ("matching_algorithm",)
    list_editable = ("color", "match", "matching_algorithm")
    search_fields = ("color", "name")

    def save_model(self, request, obj, form, change):
        old_parent = None
        if change and obj.pk:
            tag = Tag.objects.get(pk=obj.pk)
            old_parent = tag.get_parent() if tag else None

        super().save_model(request, obj, form, change)

        # sync parent tags on documents if changed
        new_parent = obj.get_parent()
        if new_parent and old_parent != new_parent:
            update_document_parent_tags(obj, new_parent)


class DocumentTypeAdmin(GuardedModelAdmin):
    list_display = ("name", "match", "matching_algorithm")
    list_filter = ("matching_algorithm",)
    list_editable = ("match", "matching_algorithm")


class DocumentAdmin(GuardedModelAdmin):
    search_fields = ("correspondent__name", "title", "content", "tags__name")
    readonly_fields = (
        "added",
        "modified",
        "mime_type",
        "storage_type",
        "filename",
        "checksum",
        "archive_filename",
        "archive_checksum",
        "original_filename",
        "deleted_at",
    )

    list_display_links = ("title",)

    list_display = ("id", "title", "mime_type", "filename", "archive_filename")

    list_filter = (
        "mime_type",
        ("archive_serial_number", admin.EmptyFieldListFilter),
        ("archive_filename", admin.EmptyFieldListFilter),
    )

    filter_horizontal = ("tags",)

    ordering = ["-id"]

    date_hierarchy = "created"

    def has_add_permission(self, request):
        return False

    def created_(self, obj):
        return obj.created.date().strftime("%Y-%m-%d")

    created_.short_description = "Created"

    def get_queryset(self, request):  # pragma: no cover
        """
        Include trashed documents
        """
        return Document.global_objects.all()

    def delete_queryset(self, request, queryset):
        from documents import index

        with index.open_index_writer() as writer:
            for o in queryset:
                index.remove_document(writer, o)

        super().delete_queryset(request, queryset)

    def delete_model(self, request, obj):
        from documents import index

        index.remove_document_from_index(obj)
        super().delete_model(request, obj)

    def save_model(self, request, obj, form, change):
        from documents import index

        index.add_or_update_document(obj)
        super().save_model(request, obj, form, change)


class RuleInline(admin.TabularInline):
    model = SavedViewFilterRule


class SavedViewAdmin(GuardedModelAdmin):
    list_display = ("name", "owner")

    inlines = [RuleInline]

    def get_queryset(self, request):  # pragma: no cover
        return super().get_queryset(request).select_related("owner")


class StoragePathInline(admin.TabularInline):
    model = StoragePath


class StoragePathAdmin(GuardedModelAdmin):
    list_display = ("name", "path", "match", "matching_algorithm")
    list_filter = ("path", "matching_algorithm")
    list_editable = ("path", "match", "matching_algorithm")


class TaskAdmin(admin.ModelAdmin):
    list_display = ("task_id", "task_file_name", "task_name", "date_done", "status")
    list_filter = ("status", "date_done", "task_name")
    search_fields = ("task_name", "task_id", "status", "task_file_name")
    readonly_fields = (
        "task_id",
        "task_file_name",
        "task_name",
        "status",
        "date_created",
        "date_started",
        "date_done",
        "result",
    )


class NotesAdmin(GuardedModelAdmin):
    list_display = ("user", "created", "note", "document")
    list_filter = ("created", "user")
    list_display_links = ("created",)
    raw_id_fields = ("document",)
    search_fields = ("document__title",)

    def get_queryset(self, request):  # pragma: no cover
        return (
            super()
            .get_queryset(request)
            .select_related("user", "document__correspondent")
        )


class ShareLinksAdmin(GuardedModelAdmin):
    list_display = ("created", "expiration", "document")
    list_filter = ("created", "expiration", "owner")
    list_display_links = ("created",)
    raw_id_fields = ("document",)

    def get_queryset(self, request):  # pragma: no cover
        return super().get_queryset(request).select_related("document__correspondent")


class CustomFieldsAdmin(GuardedModelAdmin):
    fields = ("name", "created", "data_type")
    readonly_fields = ("created", "data_type")
    list_display = ("name", "created", "data_type")
    list_filter = ("created", "data_type")


class CustomFieldInstancesAdmin(GuardedModelAdmin):
    fields = ("field", "document", "created", "value")
    readonly_fields = ("field", "document", "created", "value")
    list_display = ("field", "document", "value", "created")
    search_fields = ("document__title",)
    list_filter = ("created", "field")

    def get_queryset(self, request):  # pragma: no cover
        return (
            super()
            .get_queryset(request)
            .select_related("field", "document__correspondent")
        )


admin.site.register(Correspondent, CorrespondentAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(SavedView, SavedViewAdmin)
admin.site.register(StoragePath, StoragePathAdmin)
admin.site.register(PaperlessTask, TaskAdmin)
admin.site.register(Note, NotesAdmin)
admin.site.register(ShareLink, ShareLinksAdmin)
admin.site.register(CustomField, CustomFieldsAdmin)
admin.site.register(CustomFieldInstance, CustomFieldInstancesAdmin)

if settings.AUDIT_LOG_ENABLED:

    class LogEntryAUDIT(LogEntryAdmin):
        def has_delete_permission(self, request, obj=None):
            return False

    admin.site.unregister(LogEntry)
    admin.site.register(LogEntry, LogEntryAUDIT)
src/documents/apps.py (new file, 33 lines)
@@ -0,0 +1,33 @@
from django.apps import AppConfig
from django.utils.translation import gettext_lazy as _


class DocumentsConfig(AppConfig):
    name = "documents"

    verbose_name = _("Documents")

    def ready(self):
        from documents.signals import document_consumption_finished
        from documents.signals import document_updated
        from documents.signals.handlers import add_inbox_tags
        from documents.signals.handlers import add_to_index
        from documents.signals.handlers import run_workflows_added
        from documents.signals.handlers import run_workflows_updated
        from documents.signals.handlers import set_correspondent
        from documents.signals.handlers import set_document_type
        from documents.signals.handlers import set_storage_path
        from documents.signals.handlers import set_tags

        document_consumption_finished.connect(add_inbox_tags)
        document_consumption_finished.connect(set_correspondent)
        document_consumption_finished.connect(set_document_type)
        document_consumption_finished.connect(set_tags)
        document_consumption_finished.connect(set_storage_path)
        document_consumption_finished.connect(add_to_index)
        document_consumption_finished.connect(run_workflows_added)
        document_updated.connect(run_workflows_updated)

        import documents.schema  # noqa: F401

        AppConfig.ready(self)
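
Since these are ordinary Django signals, downstream code can attach its own receiver the same way; a minimal sketch (the receiver itself is hypothetical, but the `document` keyword matches the handlers connected above):

from documents.signals import document_consumption_finished

def log_consumption(sender, document, **kwargs):
    # hypothetical receiver: just report the freshly consumed document
    print(f"consumed: {document}")

document_consumption_finished.connect(log_consumption)
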
src/documents/barcodes.py (new file, 496 lines)
@@ -0,0 +1,496 @@
from __future__ import annotations

import logging
import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING

from django.conf import settings
from pdf2image import convert_from_path
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf

from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.models import Tag
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import maybe_override_pixel_limit
from paperless.config import BarcodeConfig

if TYPE_CHECKING:
    from collections.abc import Callable

    from PIL import Image

logger = logging.getLogger("paperless.barcodes")


@dataclass(frozen=True)
class Barcode:
    """
    Holds the information about a single barcode and its location in a document
    """

    page: int
    value: str
    settings: BarcodeConfig

    @property
    def is_separator(self) -> bool:
        """
        Returns True if the barcode value equals the configured separation value,
        False otherwise
        """
        return self.value == self.settings.barcode_string

    @property
    def is_asn(self) -> bool:
        """
        Returns True if the barcode value matches the configured ASN prefix,
        False otherwise
        """
        return self.value.startswith(self.settings.barcode_asn_prefix)


class BarcodePlugin(ConsumeTaskPlugin):
    NAME: str = "BarcodePlugin"

    @property
    def able_to_run(self) -> bool:
        """
        Able to run if:
        - ASN from barcode detection is enabled or
        - Barcode support is enabled and the mime type is supported
        """
        if self.settings.barcode_enable_tiff_support:
            supported_mimes: set[str] = {"application/pdf", "image/tiff"}
        else:
            supported_mimes = {"application/pdf"}

        return (
            self.settings.barcode_enable_asn
            or self.settings.barcodes_enabled
            or self.settings.barcode_enable_tag
        ) and self.input_doc.mime_type in supported_mimes

    def get_settings(self) -> BarcodeConfig:
        """
        Returns the settings for this plugin (Django settings or app config)
        """
        return BarcodeConfig()

    def __init__(
        self,
        input_doc: ConsumableDocument,
        metadata: DocumentMetadataOverrides,
        status_mgr: ProgressManager,
        base_tmp_dir: Path,
        task_id: str,
    ) -> None:
        super().__init__(
            input_doc,
            metadata,
            status_mgr,
            base_tmp_dir,
            task_id,
        )
        # need these for able_to_run
        self.settings = self.get_settings()

    def setup(self) -> None:
        self.temp_dir = tempfile.TemporaryDirectory(
            dir=self.base_tmp_dir,
            prefix="barcode",
        )
        self.pdf_file: Path = self.input_doc.original_file
        self._tiff_conversion_done = False
        self.barcodes: list[Barcode] = []

    def run(self) -> None:
        # Some operations may use PIL, override pixel setting if needed
        maybe_override_pixel_limit()

        # Maybe do the conversion of TIFF to PDF
        self.convert_from_tiff_to_pdf()

        # Locate any barcodes in the files
        self.detect()

        # try reading tags from barcodes
        if (
            self.settings.barcode_enable_tag
            and (tags := self.tags) is not None
            and len(tags) > 0
        ):
            if self.metadata.tag_ids:
                self.metadata.tag_ids += tags
            else:
                self.metadata.tag_ids = tags
            logger.info(f"Found tags in barcode: {tags}")

        # Lastly attempt to split documents
        if self.settings.barcodes_enabled and (
            separator_pages := self.get_separation_pages()
        ):
            # We have pages to split against

            # Note this does NOT use the base_temp_dir, as that will be removed
            tmp_dir = Path(
                tempfile.mkdtemp(
                    dir=settings.SCRATCH_DIR,
                    prefix="paperless-barcode-split-",
                ),
            ).resolve()

            from documents import tasks

            # Create the split document tasks
            for new_document in self.separate_pages(separator_pages):
                copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)

                task = tasks.consume_file.delay(
                    ConsumableDocument(
                        # Same source, for templates
                        source=self.input_doc.source,
                        mailrule_id=self.input_doc.mailrule_id,
                        # Can't use same folder or the consume might grab it again
                        original_file=(tmp_dir / new_document.name).resolve(),
                        # Adding optional original_path for later uses in
                        # workflow matching
                        original_path=self.input_doc.original_file,
                    ),
                    # All the same metadata
                    self.metadata,
                )
                logger.info(f"Created new task {task.id} for {new_document.name}")

            # This file is now two or more files
            self.input_doc.original_file.unlink()

            msg = "Barcode splitting complete!"

            # Update the progress to complete
            self.status_mgr.send_progress(ProgressStatusOptions.SUCCESS, msg, 100, 100)

            # Request the consume task stops
            raise StopConsumeTaskError(msg)

        # Update/overwrite an ASN if possible
        # After splitting, as otherwise each split document gets the same ASN
        if self.settings.barcode_enable_asn and (located_asn := self.asn) is not None:
            logger.info(f"Found ASN in barcode: {located_asn}")
            self.metadata.asn = located_asn

    def cleanup(self) -> None:
        self.temp_dir.cleanup()

    def convert_from_tiff_to_pdf(self) -> None:
        """
        May convert a TIFF image into a PDF, if the input is a TIFF and
        the TIFF has not been made into a PDF
        """
        # Nothing to do, pdf_file is already assigned correctly
        if self.input_doc.mime_type != "image/tiff" or self._tiff_conversion_done:
            return

        self.pdf_file = convert_from_tiff_to_pdf(
            self.input_doc.original_file,
            Path(self.temp_dir.name),
        )
        self._tiff_conversion_done = True

    @staticmethod
    def read_barcodes_zxing(image: Image.Image) -> list[str]:
        barcodes = []

        import zxingcpp

        detected_barcodes = zxingcpp.read_barcodes(image)
        for barcode in detected_barcodes:
            if barcode.text:
                barcodes.append(barcode.text)
                logger.debug(
                    f"Barcode of type {barcode.format} found: {barcode.text}",
                )

        return barcodes

    @staticmethod
    def read_barcodes_pyzbar(image: Image.Image) -> list[str]:
        barcodes = []

        from pyzbar import pyzbar

        # Decode the barcode image
        detected_barcodes = pyzbar.decode(image)

        # Traverse through all the detected barcodes in image
        for barcode in detected_barcodes:
            if barcode.data:
                decoded_barcode = barcode.data.decode("utf-8")
                barcodes.append(decoded_barcode)
                logger.debug(
                    f"Barcode of type {barcode.type} found: {decoded_barcode}",
                )

        return barcodes

    def detect(self) -> None:
        """
        Scan all pages of the PDF as images, updating barcodes and the pages
        found on as we go
        """
        # Bail if barcodes already exist
        if self.barcodes:
            return

        # No op if not a TIFF
        self.convert_from_tiff_to_pdf()

        # Choose the library for reading
        if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR":
            reader: Callable[[Image.Image], list[str]] = self.read_barcodes_pyzbar
            logger.debug("Scanning for barcodes using PYZBAR")
        else:
            reader = self.read_barcodes_zxing
            logger.debug("Scanning for barcodes using ZXING")

        try:
            # Read number of pages from pdf
            with Pdf.open(self.pdf_file) as pdf:
                num_of_pages = len(pdf.pages)
            logger.debug(f"PDF has {num_of_pages} pages")

            # Get limit from configuration
            barcode_max_pages: int = (
                num_of_pages
                if self.settings.barcode_max_pages == 0
                else self.settings.barcode_max_pages
            )

            if barcode_max_pages < num_of_pages:  # pragma: no cover
                logger.debug(
                    f"Barcodes detection will be limited to the first {barcode_max_pages} pages",
                )

            # Loop over each page
            for current_page_number in range(min(num_of_pages, barcode_max_pages)):
                logger.debug(f"Processing page {current_page_number}")

                # Convert page to image
                page = convert_from_path(
                    self.pdf_file,
                    dpi=self.settings.barcode_dpi,
                    output_folder=self.temp_dir.name,
                    first_page=current_page_number + 1,
                    last_page=current_page_number + 1,
                )[0]

                # Remember filename, since it is lost by upscaling
                page_filepath = Path(page.filename)
                logger.debug(f"Image is at {page_filepath}")

                # Upscale image if configured
                factor = self.settings.barcode_upscale
                if factor > 1.0:
                    logger.debug(
                        f"Upscaling image by {factor} for better barcode detection",
                    )
                    x, y = page.size
                    page = page.resize(
                        (round(x * factor), round(y * factor)),
                    )

                # Detect barcodes
                for barcode_value in reader(page):
                    self.barcodes.append(
                        Barcode(current_page_number, barcode_value, self.settings),
                    )

                # Delete temporary image file
                page_filepath.unlink()

        # Password protected files can't be checked
        # This is the exception raised for those
        except PasswordError as e:
            logger.warning(
                f"File is likely password protected, not checking for barcodes: {e}",
            )
        # This file is really borked, allow the consumption to continue
        # but it may fail further on
        except Exception as e:  # pragma: no cover
            logger.warning(
                f"Exception during barcode scanning: {e}",
            )

    @property
    def asn(self) -> int | None:
        """
        Search the parsed barcodes for any ASNs.
        The first barcode that starts with barcode_asn_prefix
        is considered the ASN to be used.
        Returns the detected ASN (or None)
        """
        asn = None

        # Ensure the barcodes have been read
        self.detect()

        # get the first barcode that starts with barcode_asn_prefix
        asn_text: str | None = next(
            (x.value for x in self.barcodes if x.is_asn),
            None,
        )

        if asn_text:
            logger.debug(f"Found ASN Barcode: {asn_text}")
            # remove the prefix and remove whitespace
            asn_text = asn_text[len(self.settings.barcode_asn_prefix) :].strip()

            # remove non-numeric parts of the remaining string
            asn_text = re.sub(r"\D", "", asn_text)

            # now, try parsing the ASN number
            try:
                asn = int(asn_text)
            except ValueError as e:
                logger.warning(f"Failed to parse ASN number because: {e}")

        return asn
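        # Example (hypothetical values): with barcode_asn_prefix "ASN", a
        # barcode reading "ASN 00123" is stripped to "00123", non-digits are
        # dropped, and the detected ASN is the integer 123.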

    @property
    def tags(self) -> list[int]:
        """
        Search the parsed barcodes for any tags.
        Returns the detected tag ids (or empty list)
        """
        tags: list[int] = []

        # Ensure the barcodes have been read
        self.detect()

        for x in self.barcodes:
            tag_texts: str = x.value

            for raw in tag_texts.split(","):
                try:
                    tag_str: str | None = None
                    for regex in self.settings.barcode_tag_mapping:
                        if re.match(regex, raw, flags=re.IGNORECASE):
                            sub = self.settings.barcode_tag_mapping[regex]
                            tag_str = (
                                re.sub(regex, sub, raw, flags=re.IGNORECASE)
                                if sub
                                else raw
                            )
                            break

                    if tag_str:
                        tag, _ = Tag.objects.get_or_create(
                            name__iexact=tag_str,
                            defaults={"name": tag_str},
                        )

                        logger.debug(
                            f"Found Tag Barcode '{raw}', substituted "
                            f"to '{tag}' and mapped to "
                            f"tag #{tag.pk}.",
                        )
                        tags.append(tag.pk)

                except Exception as e:
                    logger.error(
                        f"Failed to find or create TAG '{raw}' because: {e}",
                    )

        return tags
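        # Example (hypothetical mapping): with barcode_tag_mapping
        # {"TAG:(.*)": "\\1"}, a barcode "TAG:invoice,TAG:2024" yields the
        # tags "invoice" and "2024", creating them if they do not exist.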

    def get_separation_pages(self) -> dict[int, bool]:
        """
        Search the parsed barcodes for separators and returns a dict of page
        numbers, which separate the file into new files, together with the
        information whether to keep the page.
        """
        # filter all barcodes for the separator string
        # get the page numbers of the separating barcodes
        retain = self.settings.barcode_retain_split_pages
        separator_pages = {
            bc.page: retain
            for bc in self.barcodes
            if bc.is_separator and (not retain or (retain and bc.page > 0))
        }  # as below, don't include the first page if retain is enabled
        if not self.settings.barcode_enable_asn:
            return separator_pages

        # add the page numbers of the ASN barcodes
        # (except for first page, that might lead to infinite loops).
        return {
            **separator_pages,
            **{bc.page: True for bc in self.barcodes if bc.is_asn and bc.page != 0},
        }

    def separate_pages(self, pages_to_split_on: dict[int, bool]) -> list[Path]:
        """
        Separate the provided pdf file on the pages_to_split_on.
        The pages which are defined by the keys in pages_to_split_on
        will be removed if the corresponding value is false.
        Returns a list of (temporary) filepaths to consume.
        These will need to be deleted later.
        """

        document_paths = []
        fname: str = self.input_doc.original_file.stem
        with Pdf.open(self.pdf_file) as input_pdf:
            # Start with an empty document
            current_document: list[Page] = []
            # A list of documents, i.e. a list of lists of pages
            documents: list[list[Page]] = [current_document]

            for idx, page in enumerate(input_pdf.pages):
                # Keep building the new PDF as long as it is not a
                # separator index
                if idx not in pages_to_split_on:
                    current_document.append(page)
                    continue

                # This is a split index
                # Start a new destination page listing
                logger.debug(f"Starting new document at idx {idx}")
                current_document = []
                documents.append(current_document)
                keep_page: bool = pages_to_split_on[idx]
                if keep_page:
                    # Keep the page
                    # (new document is started by asn barcode)
                    current_document.append(page)

            documents = [x for x in documents if len(x)]

            logger.debug(f"Split into {len(documents)} new documents")

            # Write the new documents out
            for doc_idx, document in enumerate(documents):
                dst = Pdf.new()
                dst.pages.extend(document)

                output_filename = f"{fname}_document_{doc_idx}.pdf"

                logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages")
                savepath = Path(self.temp_dir.name) / output_filename
                with savepath.open("wb") as out:
                    dst.save(out)

                copy_basic_file_stats(self.input_doc.original_file, savepath)

                document_paths.append(savepath)

        return document_paths
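
To make the split semantics concrete, here is what get_separation_pages() might hand to separate_pages() for a six-page PDF (the barcode placement is hypothetical):

pages_to_split_on = {2: False, 4: True}
# Page 2 carried a plain separator barcode: a new document starts there and the
# separator page itself is dropped (unless barcode_retain_split_pages is set).
# Page 4 carried an ASN barcode: a new document starts there and the page is kept.
# separate_pages() then writes three files: pages 0-1, page 3, and pages 4-5.
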
src/documents/bulk_download.py (new file, 107 lines)
@@ -0,0 +1,107 @@
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING
from typing import NoReturn

if TYPE_CHECKING:
    from collections.abc import Callable
    from zipfile import ZipFile

    from documents.models import Document


class BulkArchiveStrategy:
    def __init__(self, zipf: ZipFile, *, follow_formatting: bool = False) -> None:
        self.zipf: ZipFile = zipf
        if follow_formatting:
            self.make_unique_filename: Callable[..., Path | str] = (
                self._formatted_filepath
            )
        else:
            self.make_unique_filename = self._filename_only

    def _filename_only(
        self,
        doc: Document,
        *,
        archive: bool = False,
        folder: str = "",
    ) -> str:
        """
        Constructs a unique name for the given document to be used inside the
        zip file.

        The filename might not be unique enough, so a counter is appended if needed
        """
        counter = 0
        while True:
            filename: str = folder + doc.get_public_filename(
                archive=archive,
                counter=counter,
            )
            if filename in self.zipf.namelist():
                counter += 1
            else:
                return filename

    def _formatted_filepath(
        self,
        doc: Document,
        *,
        archive: bool = False,
        folder: str = "",
    ) -> Path:
        """
        Constructs a full file path for the given document to be used inside
        the zipfile.

        The path is already unique, as handled when a document is consumed or updated
        """
        if archive and doc.has_archive_version:
            if TYPE_CHECKING:
                assert doc.archive_filename is not None
            in_archive_path: Path = Path(folder) / doc.archive_filename
        else:
            if TYPE_CHECKING:
                assert doc.filename is not None
            in_archive_path = Path(folder) / doc.filename

        return in_archive_path

    def add_document(self, doc: Document) -> NoReturn:
        raise NotImplementedError  # pragma: no cover


class OriginalsOnlyStrategy(BulkArchiveStrategy):
    def add_document(self, doc: Document) -> None:
        self.zipf.write(doc.source_path, self.make_unique_filename(doc))


class ArchiveOnlyStrategy(BulkArchiveStrategy):
    def add_document(self, doc: Document) -> None:
        if doc.has_archive_version:
            if TYPE_CHECKING:
                assert doc.archive_path is not None
            self.zipf.write(
                doc.archive_path,
                self.make_unique_filename(doc, archive=True),
            )
        else:
            self.zipf.write(doc.source_path, self.make_unique_filename(doc))


class OriginalAndArchiveStrategy(BulkArchiveStrategy):
    def add_document(self, doc: Document) -> None:
        if doc.has_archive_version:
            if TYPE_CHECKING:
                assert doc.archive_path is not None
            self.zipf.write(
                doc.archive_path,
                self.make_unique_filename(doc, archive=True, folder="archive/"),
            )

        self.zipf.write(
            doc.source_path,
            self.make_unique_filename(doc, folder="originals/"),
        )
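
The three strategies differ only in which variant of each document lands in the zip; a minimal usage sketch (the zip path and the `documents` iterable are assumptions):

from zipfile import ZipFile

with ZipFile("/tmp/export.zip", "w") as zipf:
    strategy = OriginalAndArchiveStrategy(zipf, follow_formatting=True)
    for doc in documents:  # hypothetical iterable of Document instances
        strategy.add_document(doc)
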
src/documents/bulk_edit.py (new file, 728 lines)
@@ -0,0 +1,728 @@
from __future__ import annotations

import hashlib
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Literal

from celery import chain
from celery import chord
from celery import group
from celery import shared_task
from django.conf import settings
from django.db import transaction
from django.db.models import Q
from django.utils import timezone

from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.permissions import set_permissions_for_object
from documents.plugins.helpers import DocumentsStatusManager
from documents.tasks import bulk_update_documents
from documents.tasks import consume_file
from documents.tasks import update_document_content_maybe_archive_file

if TYPE_CHECKING:
    from django.contrib.auth.models import User

logger: logging.Logger = logging.getLogger("paperless.bulk_edit")


def set_correspondent(
    doc_ids: list[int],
    correspondent: Correspondent,
) -> Literal["OK"]:
    if correspondent:
        correspondent = Correspondent.objects.only("pk").get(id=correspondent)

    qs = (
        Document.objects.filter(Q(id__in=doc_ids) & ~Q(correspondent=correspondent))
        .select_related("correspondent")
        .only("pk", "correspondent__id")
    )
    affected_docs = list(qs.values_list("pk", flat=True))
    qs.update(correspondent=correspondent)

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"


def set_storage_path(doc_ids: list[int], storage_path: StoragePath) -> Literal["OK"]:
    if storage_path:
        storage_path = StoragePath.objects.only("pk").get(id=storage_path)

    qs = (
        Document.objects.filter(
            Q(id__in=doc_ids) & ~Q(storage_path=storage_path),
        )
        .select_related("storage_path")
        .only("pk", "storage_path__id")
    )
    affected_docs = list(qs.values_list("pk", flat=True))
    qs.update(storage_path=storage_path)

    bulk_update_documents.delay(
        document_ids=affected_docs,
    )

    return "OK"


def set_document_type(doc_ids: list[int], document_type: DocumentType) -> Literal["OK"]:
    if document_type:
        document_type = DocumentType.objects.only("pk").get(id=document_type)

    qs = (
        Document.objects.filter(Q(id__in=doc_ids) & ~Q(document_type=document_type))
        .select_related("document_type")
        .only("pk", "document_type__id")
    )
    affected_docs = list(qs.values_list("pk", flat=True))
    qs.update(document_type=document_type)

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"


def add_tag(doc_ids: list[int], tag: int) -> Literal["OK"]:
    tag_obj = Tag.objects.get(pk=tag)
    tags_to_add = [tag_obj, *tag_obj.get_ancestors()]

    DocumentTagRelationship = Document.tags.through
    to_create = []
    affected_docs: set[int] = set()

    for t in tags_to_add:
        qs = Document.objects.filter(Q(id__in=doc_ids) & ~Q(tags__id=t.id)).only("pk")
        doc_ids_missing_tag = list(qs.values_list("pk", flat=True))
        affected_docs.update(doc_ids_missing_tag)
        to_create.extend(
            DocumentTagRelationship(document_id=doc, tag_id=t.id)
            for doc in doc_ids_missing_tag
        )

    if to_create:
        DocumentTagRelationship.objects.bulk_create(to_create)

    if affected_docs:
        bulk_update_documents.delay(document_ids=list(affected_docs))

    return "OK"


def remove_tag(doc_ids: list[int], tag: int) -> Literal["OK"]:
    tag_obj = Tag.objects.get(pk=tag)
    tag_ids = [tag_obj.id, *tag_obj.get_descendants_pks()]

    DocumentTagRelationship = Document.tags.through
    qs = DocumentTagRelationship.objects.filter(
        document_id__in=doc_ids,
        tag_id__in=tag_ids,
    )
    affected_docs = list(qs.values_list("document_id", flat=True).distinct())
    qs.delete()

    if affected_docs:
        bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"


def modify_tags(
    doc_ids: list[int],
    add_tags: list[int],
    remove_tags: list[int],
) -> Literal["OK"]:
    qs = Document.objects.filter(id__in=doc_ids).only("pk")
    affected_docs = list(qs.values_list("pk", flat=True))
    DocumentTagRelationship = Document.tags.through

    # add with all ancestors
    expanded_add_tags: set[int] = set()
    add_tag_objects = Tag.objects.filter(pk__in=add_tags)
    for t in add_tag_objects:
        expanded_add_tags.add(int(t.id))
        expanded_add_tags.update(int(pk) for pk in t.get_ancestors_pks())

    # remove with all descendants
    expanded_remove_tags: set[int] = set()
    remove_tag_objects = Tag.objects.filter(pk__in=remove_tags)
    for t in remove_tag_objects:
        expanded_remove_tags.add(int(t.id))
        expanded_remove_tags.update(int(pk) for pk in t.get_descendants_pks())

    try:
        with transaction.atomic():
            if expanded_remove_tags:
                DocumentTagRelationship.objects.filter(
                    document_id__in=affected_docs,
                    tag_id__in=expanded_remove_tags,
                ).delete()

            to_create = []
            if expanded_add_tags:
                existing_pairs = set(
                    DocumentTagRelationship.objects.filter(
                        document_id__in=affected_docs,
                        tag_id__in=expanded_add_tags,
                    ).values_list("document_id", "tag_id"),
                )

                to_create = [
                    DocumentTagRelationship(document_id=doc, tag_id=tag)
                    for doc in affected_docs
                    for tag in expanded_add_tags
                    if (doc, tag) not in existing_pairs
                ]

            if to_create:
                DocumentTagRelationship.objects.bulk_create(
                    to_create,
                    ignore_conflicts=True,
                )

        if affected_docs:
            bulk_update_documents.delay(document_ids=affected_docs)
    except Exception as e:
        logger.error(f"Error modifying tags: {e}")
        return "ERROR"

    return "OK"


def modify_custom_fields(
    doc_ids: list[int],
    add_custom_fields: list[int] | dict,
    remove_custom_fields: list[int],
) -> Literal["OK"]:
    qs = Document.objects.filter(id__in=doc_ids).only("pk")
    affected_docs = list(qs.values_list("pk", flat=True))
    # Ensure add_custom_fields is a list of tuples, supports old API
    add_custom_fields = (
        add_custom_fields.items()
        if isinstance(add_custom_fields, dict)
        else [(field, None) for field in add_custom_fields]
    )

    custom_fields = CustomField.objects.filter(
        id__in=[int(field) for field, _ in add_custom_fields],
    ).distinct()
    for field_id, value in add_custom_fields:
        for doc_id in affected_docs:
            defaults = {}
            custom_field = custom_fields.get(id=field_id)
            if custom_field:
                value_field = CustomFieldInstance.TYPE_TO_DATA_STORE_NAME_MAP[
                    custom_field.data_type
                ]
                defaults[value_field] = value
                if (
                    custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK
                    and value
                    and doc_id in value
                ):
                    # Prevent self-linking
                    continue
            CustomFieldInstance.objects.update_or_create(
                document_id=doc_id,
                field_id=field_id,
                defaults=defaults,
            )
            if custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
                doc = Document.objects.get(id=doc_id)
                reflect_doclinks(doc, custom_field, value)

    # For doc link fields that are being removed, remove symmetrical links
    for doclink_being_removed_instance in CustomFieldInstance.objects.filter(
        document_id__in=affected_docs,
        field__id__in=remove_custom_fields,
        field__data_type=CustomField.FieldDataType.DOCUMENTLINK,
        value_document_ids__isnull=False,
    ):
        for target_doc_id in doclink_being_removed_instance.value:
            remove_doclink(
                document=Document.objects.get(
                    id=doclink_being_removed_instance.document.id,
                ),
                field=doclink_being_removed_instance.field,
                target_doc_id=target_doc_id,
            )

    # Finally, remove the custom fields
    CustomFieldInstance.objects.filter(
        document_id__in=affected_docs,
        field_id__in=remove_custom_fields,
    ).hard_delete()

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
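
# Example (hypothetical ids/values): `add_custom_fields` accepts both shapes,
# the old list form (fields only) and the newer dict form (field id -> value):
#   modify_custom_fields([1, 2], add_custom_fields=[3], remove_custom_fields=[])
#   modify_custom_fields([1, 2], add_custom_fields={3: "2024-01-01"}, remove_custom_fields=[5])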


@shared_task
def delete(doc_ids: list[int]) -> Literal["OK"]:
    try:
        Document.objects.filter(id__in=doc_ids).delete()

        from documents import index

        with index.open_index_writer() as writer:
            for id in doc_ids:
                index.remove_document_by_id(writer, id)

        status_mgr = DocumentsStatusManager()
        status_mgr.send_documents_deleted(doc_ids)
    except Exception as e:
        if "Data too long for column" in str(e):
            logger.warning(
                "Detected a possible incompatible database column. See https://docs.paperless-ngx.com/troubleshooting/#convert-uuid-field",
            )
        logger.error(f"Error deleting documents: {e!s}")

    return "OK"


def reprocess(doc_ids: list[int]) -> Literal["OK"]:
    for document_id in doc_ids:
        update_document_content_maybe_archive_file.delay(
            document_id=document_id,
        )

    return "OK"


def set_permissions(
    doc_ids: list[int],
    set_permissions,
    *,
    owner=None,
    merge=False,
) -> Literal["OK"]:
    qs = Document.objects.filter(id__in=doc_ids).select_related("owner")

    if merge:
        # If merging, only set owner for documents that don't have an owner
        qs.filter(owner__isnull=True).update(owner=owner)
    else:
        qs.update(owner=owner)

    for doc in qs:
        set_permissions_for_object(permissions=set_permissions, object=doc, merge=merge)

    affected_docs = list(qs.values_list("pk", flat=True))

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"


def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
    logger.info(
        f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.",
    )
    qs = Document.objects.filter(id__in=doc_ids)
    affected_docs: list[int] = []
    import pikepdf

    rotate_tasks = []
    for doc in qs:
        if doc.mime_type != "application/pdf":
            logger.warning(
                f"Document {doc.id} is not a PDF, skipping rotation.",
            )
            continue
        try:
            with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf:
                for page in pdf.pages:
                    page.rotate(degrees, relative=True)
                pdf.save()
                doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
                doc.save()
                rotate_tasks.append(
                    update_document_content_maybe_archive_file.s(
                        document_id=doc.id,
                    ),
                )
                logger.info(
                    f"Rotated document {doc.id} by {degrees} degrees",
                )
                affected_docs.append(doc.id)
        except Exception as e:
            logger.exception(f"Error rotating document {doc.id}: {e}")

    if len(affected_docs) > 0:
        bulk_update_task = bulk_update_documents.si(document_ids=affected_docs)
        chord(header=rotate_tasks, body=bulk_update_task).delay()

    return "OK"


def merge(
    doc_ids: list[int],
    *,
    metadata_document_id: int | None = None,
    delete_originals: bool = False,
    archive_fallback: bool = False,
    user: User | None = None,
) -> Literal["OK"]:
    logger.info(
        f"Attempting to merge {len(doc_ids)} documents into a single document.",
    )
    qs = Document.objects.filter(id__in=doc_ids)
    affected_docs: list[int] = []
    import pikepdf

    merged_pdf = pikepdf.new()
    version: str = merged_pdf.pdf_version
    # use doc_ids to preserve order
    for doc_id in doc_ids:
        doc = qs.get(id=doc_id)
        try:
            doc_path = (
                doc.archive_path
                if archive_fallback
                and doc.mime_type != "application/pdf"
                and doc.has_archive_version
                else doc.source_path
            )
            with pikepdf.open(str(doc_path)) as pdf:
                version = max(version, pdf.pdf_version)
                merged_pdf.pages.extend(pdf.pages)
            affected_docs.append(doc.id)
        except Exception as e:
            logger.exception(
                f"Error merging document {doc.id}, it will not be included in the merge: {e}",
            )
    if len(affected_docs) == 0:
        logger.warning("No documents were merged")
        return "OK"

    filepath = (
        Path(
            tempfile.mkdtemp(dir=settings.SCRATCH_DIR),
        )
        / f"{'_'.join([str(doc_id) for doc_id in affected_docs])[:100]}_merged.pdf"
    )
    merged_pdf.remove_unreferenced_resources()
    merged_pdf.save(filepath, min_version=version)
    merged_pdf.close()

    if metadata_document_id:
        metadata_document = qs.get(id=metadata_document_id)
        if metadata_document is not None:
            overrides: DocumentMetadataOverrides = (
                DocumentMetadataOverrides.from_document(metadata_document)
            )
            overrides.title = metadata_document.title + " (merged)"
        else:
            overrides = DocumentMetadataOverrides()
    else:
        overrides = DocumentMetadataOverrides()

    if user is not None:
        overrides.owner_id = user.id

    logger.info("Adding merged document to the task queue.")

    consume_task = consume_file.s(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=filepath,
        ),
        overrides,
    )

    if delete_originals:
        logger.info(
            "Queueing removal of original documents after consumption of merged document",
        )
        chain(consume_task, delete.si(affected_docs)).delay()
    else:
        consume_task.delay()

    return "OK"


def split(
    doc_ids: list[int],
    pages: list[list[int]],
    *,
    delete_originals: bool = False,
    user: User | None = None,
) -> Literal["OK"]:
    logger.info(
        f"Attempting to split document {doc_ids[0]} into {len(pages)} documents",
    )
    doc = Document.objects.get(id=doc_ids[0])
    import pikepdf

    consume_tasks = []

    try:
        with pikepdf.open(doc.source_path) as pdf:
            for idx, split_doc in enumerate(pages):
                dst: pikepdf.Pdf = pikepdf.new()
                for page in split_doc:
                    dst.pages.append(pdf.pages[page - 1])
                filepath: Path = (
                    Path(
                        tempfile.mkdtemp(dir=settings.SCRATCH_DIR),
                    )
                    / f"{doc.id}_{split_doc[0]}-{split_doc[-1]}.pdf"
                )
                dst.remove_unreferenced_resources()
                dst.save(filepath)
                dst.close()

                overrides: DocumentMetadataOverrides = (
                    DocumentMetadataOverrides().from_document(doc)
                )
                overrides.title = f"{doc.title} (split {idx + 1})"
                if user is not None:
                    overrides.owner_id = user.id
                logger.info(
                    f"Adding split document with pages {split_doc} to the task queue.",
                )
                consume_tasks.append(
                    consume_file.s(
                        ConsumableDocument(
                            source=DocumentSource.ConsumeFolder,
                            original_file=filepath,
                        ),
                        overrides,
                    ),
                )

        if delete_originals:
            logger.info(
                "Queueing removal of original document after consumption of the split documents",
            )
            chord(header=consume_tasks, body=delete.si([doc.id])).delay()
        else:
            group(consume_tasks).delay()

    except Exception as e:
        logger.exception(f"Error splitting document {doc.id}: {e}")

    return "OK"


def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
    logger.info(
        f"Attempting to delete pages {pages} from {len(doc_ids)} documents",
    )
    doc = Document.objects.get(id=doc_ids[0])
    pages = sorted(pages)  # sort pages to avoid index issues
    import pikepdf

    try:
        with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf:
            offset = 1  # pages are 1-indexed
            for page_num in pages:
                pdf.pages.remove(pdf.pages[page_num - offset])
                offset += 1  # remove() changes the index of the pages
            pdf.remove_unreferenced_resources()
            pdf.save()
        doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
        if doc.page_count is not None:
            doc.page_count = doc.page_count - len(pages)
        doc.save()
        update_document_content_maybe_archive_file.delay(document_id=doc.id)
        logger.info(f"Deleted pages {pages} from document {doc.id}")
    except Exception as e:
        logger.exception(f"Error deleting pages from document {doc.id}: {e}")

    return "OK"


def edit_pdf(
    doc_ids: list[int],
    operations: list[dict],
    *,
    delete_original: bool = False,
    update_document: bool = False,
    include_metadata: bool = True,
    user: User | None = None,
) -> Literal["OK"]:
    """
    Operations is a list of dictionaries describing the final PDF pages.
    Each entry must contain the original page number in `page` and may
    specify `rotate` in degrees and `doc` indicating the output
    document index (for splitting). Pages omitted from the list are
    discarded.
    """

    logger.info(
        f"Editing PDF of document {doc_ids[0]} with {len(operations)} operations",
    )
    doc = Document.objects.get(id=doc_ids[0])
    import pikepdf

    pdf_docs: list[pikepdf.Pdf] = []

    try:
        with pikepdf.open(doc.source_path) as src:
            # prepare output documents
            max_idx = max(op.get("doc", 0) for op in operations)
            pdf_docs = [pikepdf.new() for _ in range(max_idx + 1)]

            if update_document and len(pdf_docs) > 1:
                logger.error(
                    "Update requested but multiple output documents specified",
                )
                raise ValueError("Multiple output documents specified")

            for op in operations:
                dst = pdf_docs[op.get("doc", 0)]
                page = src.pages[op["page"] - 1]
                dst.pages.append(page)
                if op.get("rotate"):
                    dst.pages[-1].rotate(op["rotate"], relative=True)

        if update_document:
            temp_path = doc.source_path.with_suffix(".tmp.pdf")
            pdf = pdf_docs[0]
            pdf.remove_unreferenced_resources()
            # save the edited PDF to a temporary file in case of errors
            pdf.save(temp_path)
            # replace the original document with the edited one
            temp_path.replace(doc.source_path)
            doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
            doc.page_count = len(pdf.pages)
            doc.save()
            update_document_content_maybe_archive_file.delay(document_id=doc.id)
        else:
            consume_tasks = []
            overrides = (
                DocumentMetadataOverrides().from_document(doc)
                if include_metadata
                else DocumentMetadataOverrides()
            )
            if user is not None:
                overrides.owner_id = user.id

            for idx, pdf in enumerate(pdf_docs, start=1):
                filepath: Path = (
                    Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
                    / f"{doc.id}_edit_{idx}.pdf"
                )
                pdf.remove_unreferenced_resources()
                pdf.save(filepath)
                consume_tasks.append(
                    consume_file.s(
                        ConsumableDocument(
                            source=DocumentSource.ConsumeFolder,
                            original_file=filepath,
                        ),
                        overrides,
                    ),
                )

            if delete_original:
                chord(header=consume_tasks, body=delete.si([doc.id])).delay()
            else:
                group(consume_tasks).delay()

    except Exception as e:
        logger.exception(f"Error editing document {doc.id}: {e}")
        raise ValueError(
            f"An error occurred while editing the document: {e}",
        ) from e

    return "OK"
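
# Example (hypothetical document id 42): an `operations` payload that keeps
# pages 1-3, rotates page 2 by 90 degrees, and routes page 3 into a second
# output document; any page not listed is dropped:
#   edit_pdf([42], [{"page": 1}, {"page": 2, "rotate": 90}, {"page": 3, "doc": 1}])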


def reflect_doclinks(
    document: Document,
    field: CustomField,
    target_doc_ids: list[int],
):
    """
    Add or remove 'symmetrical' links to `document` on all `target_doc_ids`
    """

    if target_doc_ids is None:
        target_doc_ids = []

    # Check if any documents are going to be removed from the current list of links and remove the symmetrical links
    current_field_instance = CustomFieldInstance.objects.filter(
        field=field,
        document=document,
    ).first()
    if current_field_instance is not None and current_field_instance.value is not None:
        for doc_id in current_field_instance.value:
            if doc_id not in target_doc_ids:
                remove_doclink(
                    document=document,
                    field=field,
                    target_doc_id=doc_id,
                )

    # Create an instance if target doc doesn't have this field or append it to an existing one
    existing_custom_field_instances = {
        custom_field.document_id: custom_field
        for custom_field in CustomFieldInstance.objects.filter(
            field=field,
            document_id__in=target_doc_ids,
        )
    }
    custom_field_instances_to_create = []
    custom_field_instances_to_update = []
    for target_doc_id in target_doc_ids:
        target_doc_field_instance = existing_custom_field_instances.get(
            target_doc_id,
        )
        if target_doc_field_instance is None:
            custom_field_instances_to_create.append(
                CustomFieldInstance(
                    document_id=target_doc_id,
                    field=field,
                    value_document_ids=[document.id],
                ),
            )
        elif target_doc_field_instance.value is None:
            target_doc_field_instance.value_document_ids = [document.id]
            custom_field_instances_to_update.append(target_doc_field_instance)
        elif document.id not in target_doc_field_instance.value:
            target_doc_field_instance.value_document_ids.append(document.id)
            custom_field_instances_to_update.append(target_doc_field_instance)

    CustomFieldInstance.objects.bulk_create(custom_field_instances_to_create)
    CustomFieldInstance.objects.bulk_update(
        custom_field_instances_to_update,
        ["value_document_ids"],
    )
    Document.objects.filter(id__in=target_doc_ids).update(modified=timezone.now())


def remove_doclink(
    document: Document,
    field: CustomField,
    target_doc_id: int,
):
    """
    Removes a 'symmetrical' link to `document` from the target document's existing custom field instance
    """
    target_doc_field_instance = CustomFieldInstance.objects.filter(
        document_id=target_doc_id,
        field=field,
    ).first()
    if (
        target_doc_field_instance is not None
        and document.id in target_doc_field_instance.value
    ):
        target_doc_field_instance.value.remove(document.id)
        target_doc_field_instance.save()
    Document.objects.filter(id=target_doc_id).update(modified=timezone.now())
src/documents/caching.py (new file, 296 lines)
@@ -0,0 +1,296 @@
from __future__ import annotations

import logging
import pickle
from binascii import hexlify
from collections import OrderedDict
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Any
from typing import Final

from django.conf import settings
from django.core.cache import cache
from django.core.cache import caches

from documents.models import Document

if TYPE_CHECKING:
    from django.core.cache.backends.base import BaseCache

    from documents.classifier import DocumentClassifier

logger = logging.getLogger("paperless.caching")


@dataclass(frozen=True)
class MetadataCacheData:
    original_checksum: str
    original_metadata: list
    archive_checksum: str | None
    archive_metadata: list | None


@dataclass(frozen=True)
class SuggestionCacheData:
    classifier_version: int
    classifier_hash: str
    suggestions: dict


CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"

CACHE_1_MINUTE: Final[int] = 60
CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE

read_cache = caches["read-cache"]


class LRUCache:
    def __init__(self, capacity: int = 128):
        self._data = OrderedDict()
        self.capacity = capacity

    def get(self, key, default=None) -> Any | None:
        if key in self._data:
            self._data.move_to_end(key)
            return self._data[key]
        return default

    def set(self, key, value) -> None:
        self._data[key] = value
        self._data.move_to_end(key)
        while len(self._data) > self.capacity:
            self._data.popitem(last=False)


class StoredLRUCache(LRUCache):
    """
    LRU cache that can persist its entire contents as a single entry in a backend cache.

    Useful for sharing a cache across multiple workers or processes.

    Workflow:
    1. Load the cache state from the backend using `load()`.
    2. Use `get()` and `set()` locally as usual.
    3. Persist changes back to the backend using `save()`.
    """

    def __init__(
        self,
        backend_key: str,
        capacity: int = 128,
        backend: BaseCache = read_cache,
        backend_ttl=settings.CACHALOT_TIMEOUT,
    ):
        if backend_key is None:
            raise ValueError("backend_key is mandatory")
        super().__init__(capacity)
        self._backend_key = backend_key
        self._backend = backend
        self.backend_ttl = backend_ttl

    def load(self) -> None:
        """
        Load the whole cache content from backend storage.

        If no valid cached data exists in the backend, the local cache is cleared.
        """
        serialized_data = self._backend.get(self._backend_key)
        try:
            self._data = (
                pickle.loads(serialized_data) if serialized_data else OrderedDict()
            )
        except pickle.PickleError:
            logger.warning(
                "Cache exists in backend but could not be read (possibly invalid format)",
            )

    def save(self) -> None:
        """Save the entire local cache to the backend as a serialized object.

        The backend entry will expire after the configured TTL.
        """
        self._backend.set(
            self._backend_key,
            pickle.dumps(self._data),
            self.backend_ttl,
        )
|
||||
|
||||
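

# A minimal usage sketch of StoredLRUCache (illustrative only; "shared_stems"
# is a hypothetical backend key, not one used elsewhere in this module):
#
#     stems = StoredLRUCache("shared_stems", capacity=2)
#     stems.load()                 # pull any shared state from the backend
#     stems.set("amazed", "amaz")
#     stems.set("amazing", "amaz")
#     stems.set("running", "run")  # evicts "amazed", the least recently used
#     stems.save()                 # persist the whole cache for other workers
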
def get_suggestion_cache_key(document_id: int) -> str:
    """
    Returns the basic key for a document's suggestions
    """
    return f"doc_{document_id}_suggest"


def get_suggestion_cache(document_id: int) -> SuggestionCacheData | None:
    """
    If possible, return the cached suggestions for the given document ID.
    The classifier needs to be matching in format and hash and the suggestions need to
    have been cached once.
    """
    from documents.classifier import DocumentClassifier

    doc_key = get_suggestion_cache_key(document_id)
    cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
    # The document suggestions are in the cache
    if doc_key in cache_hits:
        doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
        # The classifier format is the same
        # The classifier hash is the same
        # Then the suggestions can be used
        if (
            CLASSIFIER_VERSION_KEY in cache_hits
            and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
            and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
        ) and (
            CLASSIFIER_HASH_KEY in cache_hits
            and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
        ):
            return doc_suggestions
        else:  # pragma: no cover
            # Remove the key because something didn't match
            cache.delete(doc_key)
    return None
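

# Sketch of the intended round trip (names are from this module; the view code
# that drives it lives elsewhere, and the dict shape shown is illustrative):
#
#     set_suggestions_cache(doc.pk, {"tags": [1, 2]}, classifier)
#     ...
#     cached = get_suggestion_cache(doc.pk)
#     if cached is not None:
#         suggestions = cached.suggestions  # only returned while the
#                                           # classifier version and hash match
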
def set_suggestions_cache(
    document_id: int,
    suggestions: dict,
    classifier: DocumentClassifier | None,
    *,
    timeout=CACHE_50_MINUTES,
) -> None:
    """
    Caches the given suggestions, which were generated by the given classifier. If there is no classifier,
    this function is a no-op (there won't be suggestions then anyway)
    """
    if classifier is not None:
        doc_key = get_suggestion_cache_key(document_id)
        cache.set(
            doc_key,
            SuggestionCacheData(
                classifier.FORMAT_VERSION,
                hexlify(classifier.last_auto_type_hash).decode(),
                suggestions,
            ),
            timeout,
        )


def refresh_suggestions_cache(
    document_id: int,
    *,
    timeout: int = CACHE_50_MINUTES,
) -> None:
    """
    Refreshes the expiration of the suggestions for the given document ID
    to the given timeout
    """
    doc_key = get_suggestion_cache_key(document_id)
    cache.touch(doc_key, timeout)
def get_metadata_cache_key(document_id: int) -> str:
    """
    Returns the basic key for a document's metadata
    """
    return f"doc_{document_id}_metadata"


def get_metadata_cache(document_id: int) -> MetadataCacheData | None:
    """
    Returns the cached document metadata for the given document ID, as long as the metadata
    was cached once and the checksums have not changed
    """
    doc_key = get_metadata_cache_key(document_id)
    doc_metadata: MetadataCacheData | None = cache.get(doc_key)
    # The metadata exists in the cache
    if doc_metadata is not None:
        try:
            doc = Document.objects.only(
                "pk",
                "checksum",
                "archive_checksum",
                "archive_filename",
            ).get(pk=document_id)
            # The original checksums match
            # If it has one, the archive checksums match
            # Then, we can use the metadata
            if (
                doc_metadata.original_checksum == doc.checksum
                and doc.has_archive_version
                and doc_metadata.archive_checksum is not None
                and doc_metadata.archive_checksum == doc.archive_checksum
            ):
                # Refresh cache
                cache.touch(doc_key, CACHE_50_MINUTES)
                return doc_metadata
            else:  # pragma: no cover
                # Something didn't match, delete the key
                cache.delete(doc_key)
        except Document.DoesNotExist:  # pragma: no cover
            # Basically impossible: the key existed, but the Document didn't
            cache.delete(doc_key)
    return None
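

# Because the checksums are compared on every hit, a stale entry is discarded
# as soon as the underlying files change. Sketch (doc and the metadata lists
# are illustrative):
#
#     set_metadata_cache(doc, original_metadata, archive_metadata)
#     ...
#     cached = get_metadata_cache(doc.pk)  # None again once doc.checksum changes
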
def set_metadata_cache(
    document: Document,
    original_metadata: list,
    archive_metadata: list | None,
    *,
    timeout=CACHE_50_MINUTES,
) -> None:
    """
    Sets the metadata into cache for the given Document
    """
    doc_key = get_metadata_cache_key(document.pk)
    cache.set(
        doc_key,
        MetadataCacheData(
            document.checksum,
            original_metadata,
            document.archive_checksum,
            archive_metadata,
        ),
        timeout,
    )


def refresh_metadata_cache(
    document_id: int,
    *,
    timeout: int = CACHE_50_MINUTES,
) -> None:
    """
    Refreshes the expiration of the metadata for the given document ID
    to the given timeout
    """
    doc_key = get_metadata_cache_key(document_id)
    cache.touch(doc_key, timeout)


def get_thumbnail_modified_key(document_id: int) -> str:
    """
    Builds the key to store a thumbnail's timestamp
    """
    return f"doc_{document_id}_thumbnail_modified"


def clear_document_caches(document_id: int) -> None:
    """
    Removes all cached items for the given document
    """
    cache.delete_many(
        [
            get_suggestion_cache_key(document_id),
            get_metadata_cache_key(document_id),
            get_thumbnail_modified_key(document_id),
        ],
    )
88
src/documents/checks.py
Normal file
88
src/documents/checks.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import textwrap

from django.conf import settings
from django.core.checks import Error
from django.core.checks import Warning
from django.core.checks import register
from django.core.exceptions import FieldError
from django.db.utils import OperationalError
from django.db.utils import ProgrammingError

from documents.signals import document_consumer_declaration
from documents.templating.utils import convert_format_str_to_template_format


@register()
def changed_password_check(app_configs, **kwargs):
    from documents.models import Document
    from paperless.db import GnuPG

    try:
        encrypted_doc = (
            Document.objects.filter(
                storage_type=Document.STORAGE_TYPE_GPG,
            )
            .only("pk", "storage_type")
            .first()
        )
    except (OperationalError, ProgrammingError, FieldError):
        return []  # No documents table yet

    if encrypted_doc:
        if not settings.PASSPHRASE:
            return [
                Error(
                    "The database contains encrypted documents but no password is set.",
                ),
            ]

        if not GnuPG.decrypted(encrypted_doc.source_file):
            return [
                Error(
                    textwrap.dedent(
                        """
                        The current password doesn't match the password of the
                        existing documents.

                        If you intend to change your password, you must first export
                        all of the old documents, start fresh with the new password
                        and then re-import them.
                        """,
                    ),
                ),
            ]

    return []
@register()
def parser_check(app_configs, **kwargs):
    parsers = []
    for response in document_consumer_declaration.send(None):
        parsers.append(response[1])

    if len(parsers) == 0:
        return [
            Error(
                "No parsers found. This is a bug. The consumer won't be "
                "able to consume any documents without parsers.",
            ),
        ]
    else:
        return []


@register()
def filename_format_check(app_configs, **kwargs):
    if settings.FILENAME_FORMAT:
        converted_format = convert_format_str_to_template_format(
            settings.FILENAME_FORMAT,
        )
        if converted_format != settings.FILENAME_FORMAT:
            return [
                Warning(
                    f"Filename format {settings.FILENAME_FORMAT} is using the old style, please update to use double curly brackets",
                    hint=converted_format,
                ),
            ]
    return []
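

# These functions use Django's system check framework: anything decorated with
# @register() runs during `manage.py check` and at startup. A hypothetical
# additional check would follow the same shape (sketch, not part of this file;
# Path would need importing):
#
#     @register()
#     def scratch_dir_check(app_configs, **kwargs):
#         if not Path(settings.SCRATCH_DIR).is_dir():
#             return [Warning("Scratch directory does not exist.")]
#         return []
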
546
src/documents/classifier.py
Normal file
546
src/documents/classifier.py
Normal file
@@ -0,0 +1,546 @@
|
||||
from __future__ import annotations

import logging
import pickle
import re
import warnings
from hashlib import sha256
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Iterator
    from datetime import datetime

    from numpy import ndarray

from django.conf import settings
from django.core.cache import cache
from django.core.cache import caches

from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from documents.caching import StoredLRUCache
from documents.models import Document
from documents.models import MatchingModel

logger = logging.getLogger("paperless.classifier")

ADVANCED_TEXT_PROCESSING_ENABLED = (
    settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED
)

read_cache = caches["read-cache"]


RE_DIGIT = re.compile(r"\d")
RE_WORD = re.compile(r"\b[\w]+\b")  # words that may contain digits


class IncompatibleClassifierVersionError(Exception):
    def __init__(self, message: str, *args: object) -> None:
        self.message: str = message
        super().__init__(*args)


class ClassifierModelCorruptError(Exception):
    pass
def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
    if not settings.MODEL_FILE.is_file():
        logger.debug(
            "Document classification model does not exist (yet), not "
            "performing automatic matching.",
        )
        return None

    classifier = DocumentClassifier()
    try:
        classifier.load()

    except IncompatibleClassifierVersionError as e:
        logger.info(f"Classifier version incompatible: {e.message}, will re-train")
        Path(settings.MODEL_FILE).unlink()
        classifier = None
        if raise_exception:
            raise e
    except ClassifierModelCorruptError as e:
        # there's something wrong with the model file.
        logger.exception(
            "Unrecoverable error while loading document "
            "classification model, deleting model file.",
        )
        Path(settings.MODEL_FILE).unlink()
        classifier = None
        if raise_exception:
            raise e
    except OSError as e:
        logger.exception("IO error while loading document classification model")
        classifier = None
        if raise_exception:
            raise e
    except Exception as e:  # pragma: no cover
        logger.exception("Unknown error while loading document classification model")
        classifier = None
        if raise_exception:
            raise e

    return classifier
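

# Typical call pattern (sketch; real call sites elsewhere in the codebase may
# also pass raise_exception=True when a failure should propagate):
#
#     classifier = load_classifier()
#     if classifier is not None:
#         tag_ids = classifier.predict_tags(document.content)
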
class DocumentClassifier:
    # v7 - Updated scikit-learn package version
    # v8 - Added storage path classifier
    # v9 - Changed from hashing to time/ids for re-train check
    FORMAT_VERSION = 9

    def __init__(self) -> None:
        # last time a document changed and therefore training might be required
        self.last_doc_change_time: datetime | None = None
        # Hash of primary keys of AUTO matching values last used in training
        self.last_auto_type_hash: bytes | None = None

        self.data_vectorizer = None
        self.data_vectorizer_hash = None
        self.tags_binarizer = None
        self.tags_classifier = None
        self.correspondent_classifier = None
        self.document_type_classifier = None
        self.storage_path_classifier = None
        self._stemmer = None
        # 10,000 elements use roughly 200 to 500 KB per worker
        # (and about as much in the shared Redis cache).
        # Keep this cache small to minimize lookup and I/O latency.
        if ADVANCED_TEXT_PROCESSING_ENABLED:
            self._stem_cache = StoredLRUCache(
                f"stem_cache_v{self.FORMAT_VERSION}",
                capacity=10000,
            )
        self._stop_words = None

    def _update_data_vectorizer_hash(self):
        self.data_vectorizer_hash = sha256(
            pickle.dumps(self.data_vectorizer),
        ).hexdigest()

    def load(self) -> None:
        from sklearn.exceptions import InconsistentVersionWarning

        # Catch warnings for processing
        with warnings.catch_warnings(record=True) as w:
            with Path(settings.MODEL_FILE).open("rb") as f:
                schema_version = pickle.load(f)

                if schema_version != self.FORMAT_VERSION:
                    raise IncompatibleClassifierVersionError(
                        "Cannot load classifier, incompatible versions.",
                    )
                else:
                    try:
                        self.last_doc_change_time = pickle.load(f)
                        self.last_auto_type_hash = pickle.load(f)

                        self.data_vectorizer = pickle.load(f)
                        self._update_data_vectorizer_hash()
                        self.tags_binarizer = pickle.load(f)

                        self.tags_classifier = pickle.load(f)
                        self.correspondent_classifier = pickle.load(f)
                        self.document_type_classifier = pickle.load(f)
                        self.storage_path_classifier = pickle.load(f)
                    except Exception as err:
                        raise ClassifierModelCorruptError from err

            # Check for the warning about unpickling from differing versions
            # and consider it incompatible
            sk_learn_warning_url = (
                "https://scikit-learn.org/stable/"
                "model_persistence.html"
                "#security-maintainability-limitations"
            )
            for warning in w:
                # The warning isn't consistent: the MLPClassifier raises a specific
                # warning, other estimators have not been updated yet
                if issubclass(warning.category, InconsistentVersionWarning) or (
                    issubclass(warning.category, UserWarning)
                    and sk_learn_warning_url in str(warning.message)
                ):
                    raise IncompatibleClassifierVersionError("sklearn version update")

    def save(self) -> None:
        target_file: Path = settings.MODEL_FILE
        target_file_temp: Path = target_file.with_suffix(".pickle.part")

        with target_file_temp.open("wb") as f:
            pickle.dump(self.FORMAT_VERSION, f)

            pickle.dump(self.last_doc_change_time, f)
            pickle.dump(self.last_auto_type_hash, f)

            pickle.dump(self.data_vectorizer, f)

            pickle.dump(self.tags_binarizer, f)
            pickle.dump(self.tags_classifier, f)

            pickle.dump(self.correspondent_classifier, f)
            pickle.dump(self.document_type_classifier, f)
            pickle.dump(self.storage_path_classifier, f)

        target_file_temp.rename(target_file)
    def train(self) -> bool:
        # Get non-inbox documents
        docs_queryset = (
            Document.objects.exclude(
                tags__is_inbox_tag=True,
            )
            .select_related("document_type", "correspondent", "storage_path")
            .prefetch_related("tags")
            .order_by("pk")
        )

        # No documents exist to train against
        if docs_queryset.count() == 0:
            raise ValueError("No training data available.")

        labels_tags = []
        labels_correspondent = []
        labels_document_type = []
        labels_storage_path = []

        # Step 1: Extract and preprocess training data from the database.
        logger.debug("Gathering data from database...")
        hasher = sha256()
        for doc in docs_queryset:
            y = -1
            dt = doc.document_type
            if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = dt.pk
            hasher.update(y.to_bytes(4, "little", signed=True))
            labels_document_type.append(y)

            y = -1
            cor = doc.correspondent
            if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = cor.pk
            hasher.update(y.to_bytes(4, "little", signed=True))
            labels_correspondent.append(y)

            tags: list[int] = list(
                doc.tags.filter(matching_algorithm=MatchingModel.MATCH_AUTO)
                .order_by("pk")
                .values_list("pk", flat=True),
            )
            for tag in tags:
                hasher.update(tag.to_bytes(4, "little", signed=True))
            labels_tags.append(tags)

            y = -1
            sp = doc.storage_path
            if sp and sp.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = sp.pk
            hasher.update(y.to_bytes(4, "little", signed=True))
            labels_storage_path.append(y)

        labels_tags_unique = {tag for tags in labels_tags for tag in tags}

        num_tags = len(labels_tags_unique)

        # Check if retraining is actually required, i.e. if a document has been
        # updated since the classifier was trained, or new AUTO tags, types,
        # correspondents or storage paths exist
        latest_doc_change = docs_queryset.latest("modified").modified
        if (
            self.last_doc_change_time is not None
            and self.last_doc_change_time >= latest_doc_change
        ) and self.last_auto_type_hash == hasher.digest():
            logger.info("No updates since last training")
            # Set the classifier information into the cache
            # Caching for 50 minutes, so slightly less than the normal retrain time
            cache.set(
                CLASSIFIER_MODIFIED_KEY,
                self.last_doc_change_time,
                CACHE_50_MINUTES,
            )
            cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
            cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
            return False

        # subtract 1 since -1 (null) is also part of the classes.

        # union with {-1} accounts for cases where all documents have
        # correspondents and types assigned, so -1 isn't part of labels_x, which
        # it usually is.
        num_correspondents: int = len(set(labels_correspondent) | {-1}) - 1
        num_document_types: int = len(set(labels_document_type) | {-1}) - 1
        num_storage_paths: int = len(set(labels_storage_path) | {-1}) - 1

        logger.debug(
            f"{docs_queryset.count()} documents, {num_tags} tag(s), {num_correspondents} correspondent(s), "
            f"{num_document_types} document type(s), {num_storage_paths} storage path(s)",
        )

        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.neural_network import MLPClassifier
        from sklearn.preprocessing import LabelBinarizer
        from sklearn.preprocessing import MultiLabelBinarizer

        # Step 2: vectorize data
        logger.debug("Vectorizing data...")

        def content_generator() -> Iterator[str]:
            """
            Generates the content for documents, one at a time
            """
            for doc in docs_queryset:
                yield self.preprocess_content(doc.content, shared_cache=False)

        self.data_vectorizer = CountVectorizer(
            analyzer="word",
            ngram_range=(1, 2),
            min_df=0.01,
        )

        data_vectorized: ndarray = self.data_vectorizer.fit_transform(
            content_generator(),
        )

        # See the notes here:
        # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
        # This attribute isn't needed to function and can be large
        self.data_vectorizer.stop_words_ = None

        # Step 3: train the classifiers
        if num_tags > 0:
            logger.debug("Training tags classifier...")

            if num_tags == 1:
                # Special case where only one tag has auto:
                # Fallback to binary classification.
                labels_tags = [
                    label[0] if len(label) == 1 else -1 for label in labels_tags
                ]
                self.tags_binarizer = LabelBinarizer()
                labels_tags_vectorized: ndarray = self.tags_binarizer.fit_transform(
                    labels_tags,
                ).ravel()
            else:
                self.tags_binarizer = MultiLabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)

            self.tags_classifier = MLPClassifier(tol=0.01)
            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            self.tags_classifier = None
            logger.debug("There are no tags. Not training tags classifier.")

        if num_correspondents > 0:
            logger.debug("Training correspondent classifier...")
            self.correspondent_classifier = MLPClassifier(tol=0.01)
            self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
        else:
            self.correspondent_classifier = None
            logger.debug(
                "There are no correspondents. Not training correspondent classifier.",
            )

        if num_document_types > 0:
            logger.debug("Training document type classifier...")
            self.document_type_classifier = MLPClassifier(tol=0.01)
            self.document_type_classifier.fit(data_vectorized, labels_document_type)
        else:
            self.document_type_classifier = None
            logger.debug(
                "There are no document types. Not training document type classifier.",
            )

        if num_storage_paths > 0:
            logger.debug(
                "Training storage paths classifier...",
            )
            self.storage_path_classifier = MLPClassifier(tol=0.01)
            self.storage_path_classifier.fit(
                data_vectorized,
                labels_storage_path,
            )
        else:
            self.storage_path_classifier = None
            logger.debug(
                "There are no storage paths. Not training storage path classifier.",
            )

        self.last_doc_change_time = latest_doc_change
        self.last_auto_type_hash = hasher.digest()
        self._update_data_vectorizer_hash()

        # Set the classifier information into the cache
        # Caching for 50 minutes, so slightly less than the normal retrain time
        cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES)
        cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
        cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)

        return True
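
    # The re-train check above reduces the AUTO-matching label assignment to a
    # single digest: any added, removed or changed label flips it. A toy
    # illustration (standalone sketch, values made up):
    #
    #     from hashlib import sha256
    #     h1, h2 = sha256(), sha256()
    #     for pk in (3, 7, 9):
    #         h1.update(pk.to_bytes(4, "little", signed=True))
    #     for pk in (3, 7, 10):  # one label changed
    #         h2.update(pk.to_bytes(4, "little", signed=True))
    #     assert h1.digest() != h2.digest()
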
    def _init_advanced_text_processing(self):
        if self._stop_words is None or self._stemmer is None:
            import nltk
            from nltk.corpus import stopwords
            from nltk.stem import SnowballStemmer

            # Not really hacky, since it isn't private and is documented, but
            # set the search path for NLTK data to the single location it should be in
            nltk.data.path = [settings.NLTK_DIR]
            try:
                # Preload the corpus early, to force the lazy loader to transform
                stopwords.ensure_loaded()

                # Do some one time setup
                # Sometimes, somehow, there's multiple threads loading the corpus
                # and it's not thread safe, raising an AttributeError
                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
                self._stop_words = frozenset(stopwords.words(settings.NLTK_LANGUAGE))
            except AttributeError:
                logger.debug("Could not initialize NLTK for advanced text processing.")
                return False
        return True

    def stem_and_skip_stop_words(self, words: list[str], *, shared_cache=True):
        """
        Reduce a list of words to their stems. Stop words are converted to empty strings.
        :param words: the list of words to stem
        """

        def _stem_and_skip_stop_word(word: str):
            """
            Reduce a given word to its stem. If it's a stop word, return an empty string.
            E.g. "amazement", "amaze" and "amazed" all return "amaz".
            """
            cached = self._stem_cache.get(word)
            if cached is not None:
                return cached
            elif word in self._stop_words:
                return ""
            # Assumption: words that contain numbers are never stemmed
            elif RE_DIGIT.search(word):
                return word
            else:
                result = self._stemmer.stem(word)
                self._stem_cache.set(word, result)
                return result

        if shared_cache:
            self._stem_cache.load()

        # Stem the words and skip stop words
        result = " ".join(
            filter(None, (_stem_and_skip_stop_word(w) for w in words)),
        )
        if shared_cache:
            self._stem_cache.save()
        return result
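
    # Example of the reduction performed above (assuming NLTK_LANGUAGE is
    # "english"; exact stems depend on the Snowball stemmer version):
    #
    #     words = ["the", "amazement", "amazed", "invoice", "2024"]
    #     classifier.stem_and_skip_stop_words(words, shared_cache=False)
    #     # -> "amaz amaz invoic 2024"  ("the" is dropped as a stop word,
    #     #                              "2024" is kept unstemmed)
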
    def preprocess_content(
        self,
        content: str,
        *,
        shared_cache=True,
    ) -> str:
        """
        Process the contents of a document, distilling it down into
        words which are meaningful to the content.

        A stemmer cache is shared across workers with the parameter "shared_cache".
        This is unnecessary when training the classifier.
        """

        # Lower case the document, reduce space,
        # and keep only letters and digits.
        content = " ".join(match.group().lower() for match in RE_WORD.finditer(content))

        if ADVANCED_TEXT_PROCESSING_ENABLED:
            from nltk.tokenize import word_tokenize

            if not self._init_advanced_text_processing():
                return content
            # Tokenize
            # This splits the content into tokens, roughly words
            words = word_tokenize(content, language=settings.NLTK_LANGUAGE)
            # Stem the words and skip stop words
            content = self.stem_and_skip_stop_words(words, shared_cache=shared_cache)

        return content

    def _get_vectorizer_cache_key(self, content: str):
        hash = sha256(content.encode())
        hash.update(
            f"|{self.FORMAT_VERSION}|{settings.NLTK_LANGUAGE}|{settings.NLTK_ENABLED}|{self.data_vectorizer_hash}".encode(),
        )
        return f"vectorized_content_{hash.hexdigest()}"

    def _vectorize(self, content: str):
        key = self._get_vectorizer_cache_key(content)
        serialized_result = read_cache.get(key)
        if serialized_result is None:
            result = self.data_vectorizer.transform([self.preprocess_content(content)])
            read_cache.set(key, pickle.dumps(result), CACHE_5_MINUTES)
        else:
            read_cache.touch(key, CACHE_5_MINUTES)
            result = pickle.loads(serialized_result)
        return result
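
    # The key above folds the content hash together with the classifier format,
    # the NLTK settings and the fitted vectorizer's hash, so a change to any of
    # them yields a different key and stale vectors are never served. Sketch
    # (digest abbreviated, not a real value):
    #
    #     classifier._get_vectorizer_cache_key("some document text")
    #     # -> "vectorized_content_9f2a..."
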
    def predict_correspondent(self, content: str) -> int | None:
        if self.correspondent_classifier:
            X = self._vectorize(content)
            correspondent_id = self.correspondent_classifier.predict(X)
            if correspondent_id != -1:
                return correspondent_id
            else:
                return None
        else:
            return None

    def predict_document_type(self, content: str) -> int | None:
        if self.document_type_classifier:
            X = self._vectorize(content)
            document_type_id = self.document_type_classifier.predict(X)
            if document_type_id != -1:
                return document_type_id
            else:
                return None
        else:
            return None

    def predict_tags(self, content: str) -> list[int]:
        from sklearn.utils.multiclass import type_of_target

        if self.tags_classifier:
            X = self._vectorize(content)
            y = self.tags_classifier.predict(X)
            tags_ids = self.tags_binarizer.inverse_transform(y)[0]
            if type_of_target(y).startswith("multilabel"):
                # the usual case when there are multiple tags.
                return list(tags_ids)
            elif type_of_target(y) == "binary" and tags_ids != -1:
                # This is for when we have binary classification with only one
                # tag and the result is to assign this tag.
                return [tags_ids]
            else:
                # Usually binary as well with -1 as the result, but we're
                # going to catch everything else here as well.
                return []
        else:
            return []

    def predict_storage_path(self, content: str) -> int | None:
        if self.storage_path_classifier:
            X = self._vectorize(content)
            storage_path_id = self.storage_path_classifier.predict(X)
            if storage_path_id != -1:
                return storage_path_id
            else:
                return None
        else:
            return None
149
src/documents/conditionals.py
Normal file
149
src/documents/conditionals.py
Normal file
@@ -0,0 +1,149 @@
|
||||
from datetime import datetime
from datetime import timezone

from django.conf import settings
from django.core.cache import cache

from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from documents.caching import get_thumbnail_modified_key
from documents.classifier import DocumentClassifier
from documents.models import Document


def suggestions_etag(request, pk: int) -> str | None:
    """
    Returns an optional string for the ETag, allowing browser caching of
    suggestions if the classifier has not been changed and the suggested dates
    setting is also unchanged
    """
    # If no model file, no etag at all
    if not settings.MODEL_FILE.exists():
        return None
    # Check cache information
    cache_hits = cache.get_many(
        [CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY],
    )
    # If the version differs somehow, no etag
    if (
        CLASSIFIER_VERSION_KEY in cache_hits
        and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
    ):
        return None
    elif CLASSIFIER_HASH_KEY in cache_hits:
        # Refresh the cache and return the hash digest and the dates setting
        cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES)
        return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}"
    return None
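

# These helpers are designed to plug into Django's conditional-response
# machinery. A sketch of the wiring (the real view registration lives in the
# API layer, so the view name here is illustrative):
#
#     from django.utils.decorators import method_decorator
#     from django.views.decorators.http import condition
#
#     @method_decorator(
#         condition(
#             etag_func=suggestions_etag,
#             last_modified_func=suggestions_last_modified,
#         ),
#         name="suggestions",
#     )
#     class DocumentViewSet(...):
#         ...
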
def suggestions_last_modified(request, pk: int) -> datetime | None:
    """
    Returns the datetime of the classifier's last modification. This is slightly off,
    as there is no way to track modification of the suggested dates setting, but it
    seems unlikely that it changes too often
    """
    # No file, no last modified
    if not settings.MODEL_FILE.exists():
        return None
    cache_hits = cache.get_many(
        [CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY],
    )
    # If the version differs somehow, no last modified
    if (
        CLASSIFIER_VERSION_KEY in cache_hits
        and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
    ):
        return None
    elif CLASSIFIER_MODIFIED_KEY in cache_hits:
        # Refresh the cache and return the last modified
        cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES)
        return cache_hits[CLASSIFIER_MODIFIED_KEY]
    return None
def metadata_etag(request, pk: int) -> str | None:
    """
    Metadata is extracted from the original file, so use its checksum as the
    ETag
    """
    try:
        doc = Document.objects.only("checksum").get(pk=pk)
        return doc.checksum
    except Document.DoesNotExist:  # pragma: no cover
        return None
    return None


def metadata_last_modified(request, pk: int) -> datetime | None:
    """
    Metadata is extracted from the original file, so use its modified. Strictly speaking,
    this is not the modification of the original file, but of the database object, but we
    might as well err on the side of caution
    """
    try:
        doc = Document.objects.only("modified").get(pk=pk)
        return doc.modified
    except Document.DoesNotExist:  # pragma: no cover
        return None
    return None
def preview_etag(request, pk: int) -> str | None:
    """
    ETag for the document preview, using the original or archive checksum, depending on the request
    """
    try:
        doc = Document.objects.only("checksum", "archive_checksum").get(pk=pk)
        use_original = (
            "original" in request.query_params
            and request.query_params["original"] == "true"
        )
        return doc.checksum if use_original else doc.archive_checksum
    except Document.DoesNotExist:  # pragma: no cover
        return None
    return None


def preview_last_modified(request, pk: int) -> datetime | None:
    """
    Uses the document's modified time to set the Last-Modified header. Not strictly
    speaking correct, but close enough and quick
    """
    try:
        doc = Document.objects.only("modified").get(pk=pk)
        return doc.modified
    except Document.DoesNotExist:  # pragma: no cover
        return None
    return None


def thumbnail_last_modified(request, pk: int) -> datetime | None:
    """
    Returns the filesystem last modified either from cache or from filesystem.
    Cache should be (slightly?) faster than filesystem
    """
    try:
        doc = Document.objects.only("storage_type").get(pk=pk)
        if not doc.thumbnail_path.exists():
            return None
        doc_key = get_thumbnail_modified_key(pk)

        cache_hit = cache.get(doc_key)
        if cache_hit is not None:
            cache.touch(doc_key, CACHE_50_MINUTES)
            return cache_hit

        # No cache, get the timestamp and cache the datetime
        last_modified = datetime.fromtimestamp(
            doc.thumbnail_path.stat().st_mtime,
            tz=timezone.utc,
        )
        cache.set(doc_key, last_modified, CACHE_50_MINUTES)
        return last_modified
    except Document.DoesNotExist:  # pragma: no cover
        return None
860
src/documents/consumer.py
Normal file
860
src/documents/consumer.py
Normal file
@@ -0,0 +1,860 @@
|
||||
import datetime
import hashlib
import os
import tempfile
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING

import magic
from django.conf import settings
from django.contrib.auth.models import User
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse

from documents.classifier import load_classifier
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.loggers import LoggingMixin
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.models import WorkflowTrigger
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date
from documents.permissions import set_permissions_for_object
from documents.plugins.base import AlwaysRunPluginMixin
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin
from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions
from documents.signals import document_consumption_finished
from documents.signals import document_consumption_started
from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless_mail.parsers import MailDocumentParser
class WorkflowTriggerPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
    AlwaysRunPluginMixin,
    ConsumeTaskPlugin,
):
    NAME: str = "WorkflowTriggerPlugin"

    def run(self) -> str | None:
        """
        Get overrides from matching workflows
        """
        overrides, msg = run_workflows(
            trigger_type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
            document=self.input_doc,
            logging_group=None,
            overrides=DocumentMetadataOverrides(),
        )
        if overrides:
            self.metadata.update(overrides)
        return msg
class ConsumerError(Exception):
    pass


class ConsumerStatusShortMessage(str, Enum):
    DOCUMENT_ALREADY_EXISTS = "document_already_exists"
    DOCUMENT_ALREADY_EXISTS_IN_TRASH = "document_already_exists_in_trash"
    ASN_ALREADY_EXISTS = "asn_already_exists"
    ASN_ALREADY_EXISTS_IN_TRASH = "asn_already_exists_in_trash"
    ASN_RANGE = "asn_value_out_of_range"
    FILE_NOT_FOUND = "file_not_found"
    PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
    PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
    POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
    POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
    NEW_FILE = "new_file"
    UNSUPPORTED_TYPE = "unsupported_type"
    PARSING_DOCUMENT = "parsing_document"
    GENERATING_THUMBNAIL = "generating_thumbnail"
    PARSE_DATE = "parse_date"
    SAVE_DOCUMENT = "save_document"
    FINISHED = "finished"
    FAILED = "failed"


class ConsumerPluginMixin:
    def __init__(
        self,
        input_doc: ConsumableDocument,
        metadata: DocumentMetadataOverrides,
        status_mgr: ProgressManager,
        base_tmp_dir: Path,
        task_id: str,
    ) -> None:
        super().__init__(input_doc, metadata, status_mgr, base_tmp_dir, task_id)

        self.renew_logging_group()

        self.filename = self.metadata.filename or self.input_doc.original_file.name

    def _send_progress(
        self,
        current_progress: int,
        max_progress: int,
        status: ProgressStatusOptions,
        message: ConsumerStatusShortMessage | str | None = None,
        document_id=None,
    ):  # pragma: no cover
        self.status_mgr.send_progress(
            status,
            message,
            current_progress,
            max_progress,
            extra_args={
                "document_id": document_id,
                "owner_id": self.metadata.owner_id if self.metadata.owner_id else None,
                "users_can_view": (self.metadata.view_users or [])
                + (self.metadata.change_users or []),
                "groups_can_view": (self.metadata.view_groups or [])
                + (self.metadata.change_groups or []),
            },
        )

    def _fail(
        self,
        message: ConsumerStatusShortMessage | str,
        log_message: str | None = None,
        exc_info=None,
        exception: Exception | None = None,
    ):
        self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
        self.log.error(log_message or message, exc_info=exc_info)
        raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
class ConsumerPlugin(
    AlwaysRunPluginMixin,
    NoSetupPluginMixin,
    NoCleanupPluginMixin,
    LoggingMixin,
    ConsumerPluginMixin,
    ConsumeTaskPlugin,
):
    logging_name = "paperless.consumer"

    def run_pre_consume_script(self):
        """
        If one is configured and exists, run the pre-consume script and
        handle its output and/or errors
        """
        if not settings.PRE_CONSUME_SCRIPT:
            return

        if not Path(settings.PRE_CONSUME_SCRIPT).is_file():
            self._fail(
                ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_NOT_FOUND,
                f"Configured pre-consume script "
                f"{settings.PRE_CONSUME_SCRIPT} does not exist.",
            )

        self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")

        working_file_path = str(self.working_copy)
        original_file_path = str(self.input_doc.original_file)

        script_env = os.environ.copy()
        script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
        script_env["DOCUMENT_WORKING_PATH"] = working_file_path
        script_env["TASK_ID"] = self.task_id or ""

        try:
            run_subprocess(
                [
                    settings.PRE_CONSUME_SCRIPT,
                    original_file_path,
                ],
                script_env,
                self.log,
            )

        except Exception as e:
            self._fail(
                ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
                f"Error while executing pre-consume script: {e}",
                exc_info=True,
                exception=e,
            )
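
    # The script receives the original file as its only argument and the
    # variables set above in its environment. A minimal user-supplied
    # pre-consume script could therefore look like this (sketch, not part
    # of this codebase):
    #
    #     #!/usr/bin/env python3
    #     import os
    #     import sys
    #
    #     working = os.environ["DOCUMENT_WORKING_PATH"]
    #     print(f"about to consume {sys.argv[1]} (working copy: {working})")
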
    def run_post_consume_script(self, document: Document):
        """
        If one is configured and exists, run the post-consume script and
        handle its output and/or errors
        """
        if not settings.POST_CONSUME_SCRIPT:
            return

        if not Path(settings.POST_CONSUME_SCRIPT).is_file():
            self._fail(
                ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_NOT_FOUND,
                f"Configured post-consume script "
                f"{settings.POST_CONSUME_SCRIPT} does not exist.",
            )

        self.log.info(
            f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}",
        )

        script_env = os.environ.copy()

        script_env["DOCUMENT_ID"] = str(document.pk)
        script_env["DOCUMENT_TYPE"] = str(document.document_type)
        script_env["DOCUMENT_CREATED"] = str(document.created)
        script_env["DOCUMENT_MODIFIED"] = str(document.modified)
        script_env["DOCUMENT_ADDED"] = str(document.added)
        script_env["DOCUMENT_FILE_NAME"] = document.get_public_filename()
        script_env["DOCUMENT_SOURCE_PATH"] = os.path.normpath(document.source_path)
        script_env["DOCUMENT_ARCHIVE_PATH"] = os.path.normpath(
            str(document.archive_path),
        )
        script_env["DOCUMENT_THUMBNAIL_PATH"] = os.path.normpath(
            document.thumbnail_path,
        )
        script_env["DOCUMENT_DOWNLOAD_URL"] = reverse(
            "document-download",
            kwargs={"pk": document.pk},
        )
        script_env["DOCUMENT_THUMBNAIL_URL"] = reverse(
            "document-thumb",
            kwargs={"pk": document.pk},
        )
        script_env["DOCUMENT_OWNER"] = (
            document.owner.get_username() if document.owner else ""
        )
        script_env["DOCUMENT_CORRESPONDENT"] = str(document.correspondent)
        script_env["DOCUMENT_TAGS"] = str(
            ",".join(document.tags.all().values_list("name", flat=True)),
        )
        script_env["DOCUMENT_ORIGINAL_FILENAME"] = str(document.original_filename)
        script_env["TASK_ID"] = self.task_id or ""

        try:
            run_subprocess(
                [
                    settings.POST_CONSUME_SCRIPT,
                    str(document.pk),
                    document.get_public_filename(),
                    os.path.normpath(document.source_path),
                    os.path.normpath(document.thumbnail_path),
                    reverse("document-download", kwargs={"pk": document.pk}),
                    reverse("document-thumb", kwargs={"pk": document.pk}),
                    str(document.correspondent),
                    str(",".join(document.tags.all().values_list("name", flat=True))),
                ],
                script_env,
                self.log,
            )

        except Exception as e:
            self._fail(
                ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
                f"Error while executing post-consume script: {e}",
                exc_info=True,
                exception=e,
            )
    def run(self) -> str:
        """
        Consume the file and return a success message once the
        document has been created.
        """

        tempdir = None

        try:
            # Preflight has already run including progress update to 0%
            self.log.info(f"Consuming {self.filename}")

            # For the actual work, copy the file into a tempdir
            tempdir = tempfile.TemporaryDirectory(
                prefix="paperless-ngx",
                dir=settings.SCRATCH_DIR,
            )
            self.working_copy = Path(tempdir.name) / Path(self.filename)
            copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
            self.unmodified_original = None

            # Determine the parser class.

            mime_type = magic.from_file(self.working_copy, mime=True)

            self.log.debug(f"Detected mime type: {mime_type}")

            if (
                Path(self.filename).suffix.lower() == ".pdf"
                and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
            ):
                try:
                    # The file might be a pdf, but the mime type is wrong.
                    # Try to clean with qpdf
                    self.log.debug(
                        "Detected possible PDF with wrong mime type, trying to clean with qpdf",
                    )
                    run_subprocess(
                        [
                            "qpdf",
                            "--replace-input",
                            self.working_copy,
                        ],
                        logger=self.log,
                    )
                    mime_type = magic.from_file(self.working_copy, mime=True)
                    self.log.debug(f"Detected mime type after qpdf: {mime_type}")
                    # Save the original file for later
                    self.unmodified_original = (
                        Path(tempdir.name) / Path("uo") / Path(self.filename)
                    )
                    self.unmodified_original.parent.mkdir(exist_ok=True)
                    copy_file_with_basic_stats(
                        self.input_doc.original_file,
                        self.unmodified_original,
                    )
                except Exception as e:
                    self.log.error(f"Error attempting to clean PDF: {e}")

            # Based on the mime type, get the parser for that type
            parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
                mime_type,
            )
            if not parser_class:
                tempdir.cleanup()
                self._fail(
                    ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
                    f"Unsupported mime type {mime_type}",
                )

            # Notify all listeners that we're going to do some work.

            document_consumption_started.send(
                sender=self.__class__,
                filename=self.working_copy,
                logging_group=self.logging_group,
            )

            self.run_pre_consume_script()
        except:
            if tempdir:
                tempdir.cleanup()
            raise
        def progress_callback(current_progress, max_progress):  # pragma: no cover
            # rescale progress so it falls between 20 and 70
            p = int((current_progress / max_progress) * 50 + 20)
            self._send_progress(p, 100, ProgressStatusOptions.WORKING)

        # This doesn't parse the document yet, but gives us a parser.

        document_parser: DocumentParser = parser_class(
            self.logging_group,
            progress_callback=progress_callback,
        )

        self.log.debug(f"Parser: {type(document_parser).__name__}")

        # Parse the document. This may take some time.

        text = None
        date = None
        thumbnail = None
        archive_path = None
        page_count = None

        try:
            self._send_progress(
                20,
                100,
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.PARSING_DOCUMENT,
            )
            self.log.debug(f"Parsing {self.filename}...")
            if (
                isinstance(document_parser, MailDocumentParser)
                and self.input_doc.mailrule_id
            ):
                document_parser.parse(
                    self.working_copy,
                    mime_type,
                    self.filename,
                    self.input_doc.mailrule_id,
                )
            else:
                document_parser.parse(self.working_copy, mime_type, self.filename)

            self.log.debug(f"Generating thumbnail for {self.filename}...")
            self._send_progress(
                70,
                100,
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
            thumbnail = document_parser.get_thumbnail(
                self.working_copy,
                mime_type,
                self.filename,
            )

            text = document_parser.get_text()
            date = document_parser.get_date()
            if date is None:
                self._send_progress(
                    90,
                    100,
                    ProgressStatusOptions.WORKING,
                    ConsumerStatusShortMessage.PARSE_DATE,
                )
                date = parse_date(self.filename, text)
            archive_path = document_parser.get_archive_path()
            page_count = document_parser.get_page_count(self.working_copy, mime_type)

        except ParseError as e:
            document_parser.cleanup()
            if tempdir:
                tempdir.cleanup()
            self._fail(
                str(e),
                f"Error occurred while consuming document {self.filename}: {e}",
                exc_info=True,
                exception=e,
            )
        except Exception as e:
            document_parser.cleanup()
            if tempdir:
                tempdir.cleanup()
            self._fail(
                str(e),
                f"Unexpected error while consuming document {self.filename}: {e}",
                exc_info=True,
                exception=e,
            )
        # Prepare the document classifier.

        # TODO: I don't really like to do this here, but this way we avoid
        # reloading the classifier multiple times, since there are multiple
        # post-consume hooks that all require the classifier.

        classifier = load_classifier()

        self._send_progress(
            95,
            100,
            ProgressStatusOptions.WORKING,
            ConsumerStatusShortMessage.SAVE_DOCUMENT,
        )
        # now that everything is done, we can start to store the document
        # in the system. This will be a transaction and reasonably fast.
        try:
            with transaction.atomic():
                # store the document.
                document = self._store(
                    text=text,
                    date=date,
                    page_count=page_count,
                    mime_type=mime_type,
                )

                # If we get here, it was successful. Proceed with post-consume
                # hooks. If they fail, nothing will get changed.

                document_consumption_finished.send(
                    sender=self.__class__,
                    document=document,
                    logging_group=self.logging_group,
                    classifier=classifier,
                    original_file=self.unmodified_original
                    if self.unmodified_original
                    else self.working_copy,
                )

                # After everything is in the database, copy the files into
                # place. If this fails, we'll also rollback the transaction.
                with FileLock(settings.MEDIA_LOCK):
                    document.filename = generate_unique_filename(document)
                    create_source_path_directory(document.source_path)

                    self._write(
                        document.storage_type,
                        self.unmodified_original
                        if self.unmodified_original is not None
                        else self.working_copy,
                        document.source_path,
                    )

                    self._write(
                        document.storage_type,
                        thumbnail,
                        document.thumbnail_path,
                    )

                    if archive_path and Path(archive_path).is_file():
                        document.archive_filename = generate_unique_filename(
                            document,
                            archive_filename=True,
                        )
                        create_source_path_directory(document.archive_path)
                        self._write(
                            document.storage_type,
                            archive_path,
                            document.archive_path,
                        )

                        with Path(archive_path).open("rb") as f:
                            document.archive_checksum = hashlib.md5(
                                f.read(),
                            ).hexdigest()

                # Don't save with the lock active. Saving will cause the file
                # renaming logic to acquire the lock as well.
                # This triggers things like file renaming
                document.save()

            # Delete the file only if it was successfully consumed
            self.log.debug(f"Deleting original file {self.input_doc.original_file}")
            self.input_doc.original_file.unlink()
            self.log.debug(f"Deleting working copy {self.working_copy}")
            self.working_copy.unlink()
            if self.unmodified_original is not None:  # pragma: no cover
                self.log.debug(
                    f"Deleting unmodified original file {self.unmodified_original}",
                )
                self.unmodified_original.unlink()

            # https://github.com/jonaswinkler/paperless-ng/discussions/1037
            shadow_file = (
                Path(self.input_doc.original_file).parent
                / f"._{Path(self.input_doc.original_file).name}"
            )

            if Path(shadow_file).is_file():
                self.log.debug(f"Deleting shadow file {shadow_file}")
                Path(shadow_file).unlink()

        except Exception as e:
            self._fail(
                str(e),
                f"The following error occurred while storing document "
                f"{self.filename} after parsing: {e}",
                exc_info=True,
                exception=e,
            )
        finally:
            document_parser.cleanup()
            tempdir.cleanup()

        self.run_post_consume_script(document)

        self.log.info(f"Document {document} consumption finished")

        self._send_progress(
            100,
            100,
            ProgressStatusOptions.SUCCESS,
            ConsumerStatusShortMessage.FINISHED,
            document.id,
        )

        # Return the most up to date fields
        document.refresh_from_db()

        return f"Success. New document id {document.pk} created"
    def _parse_title_placeholders(self, title: str) -> str:
        local_added = timezone.localtime(timezone.now())

        correspondent_name = (
            Correspondent.objects.get(pk=self.metadata.correspondent_id).name
            if self.metadata.correspondent_id is not None
            else None
        )
        doc_type_name = (
            DocumentType.objects.get(pk=self.metadata.document_type_id).name
            if self.metadata.document_type_id is not None
            else None
        )
        owner_username = (
            User.objects.get(pk=self.metadata.owner_id).username
            if self.metadata.owner_id is not None
            else None
        )

        return parse_w_workflow_placeholders(
            title,
            correspondent_name,
            doc_type_name,
            owner_username,
            local_added,
            self.filename,
            self.filename,
        )

    def _store(
        self,
        text: str,
        date: datetime.datetime | None,
        page_count: int | None,
        mime_type: str,
    ) -> Document:
        # If someone gave us the original filename, use it instead of doc.

        self.log.debug("Saving record to database")

        if self.metadata.created is not None:
            create_date = self.metadata.created
            self.log.debug(
                f"Creation date from post_documents parameter: {create_date}",
            )
        elif date is not None:
            create_date = date
            self.log.debug(f"Creation date from parse_date: {create_date}")
        else:
            stats = Path(self.input_doc.original_file).stat()
            create_date = timezone.make_aware(
                datetime.datetime.fromtimestamp(stats.st_mtime),
            )
            self.log.debug(f"Creation date from st_mtime: {create_date}")

        storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        if self.metadata.filename:
            title = Path(self.metadata.filename).stem
        else:
            title = self.input_doc.original_file.stem

        if self.metadata.title is not None:
            try:
                title = self._parse_title_placeholders(self.metadata.title)
            except Exception as e:
                self.log.error(
                    f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}",
                )

        file_for_checksum = (
            self.unmodified_original
            if self.unmodified_original is not None
            else self.working_copy
        )

        document = Document.objects.create(
            title=title[:127],
            content=text,
            mime_type=mime_type,
            checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
            created=create_date,
            modified=create_date,
            storage_type=storage_type,
            page_count=page_count,
            original_filename=self.filename,
        )

        self.apply_overrides(document)

        document.save()

        return document
def apply_overrides(self, document):
|
||||
if self.metadata.correspondent_id:
|
||||
document.correspondent = Correspondent.objects.get(
|
||||
pk=self.metadata.correspondent_id,
|
||||
)
|
||||
|
||||
if self.metadata.document_type_id:
|
||||
document.document_type = DocumentType.objects.get(
|
||||
pk=self.metadata.document_type_id,
|
||||
)
|
||||
|
||||
if self.metadata.tag_ids:
|
||||
for tag_id in self.metadata.tag_ids:
|
||||
document.add_nested_tags([Tag.objects.get(pk=tag_id)])
|
||||
|
||||
if self.metadata.storage_path_id:
|
||||
document.storage_path = StoragePath.objects.get(
|
||||
pk=self.metadata.storage_path_id,
|
||||
)
|
||||
|
||||
if self.metadata.asn is not None:
|
||||
document.archive_serial_number = self.metadata.asn
|
||||
|
||||
if self.metadata.owner_id:
|
||||
document.owner = User.objects.get(
|
||||
pk=self.metadata.owner_id,
|
||||
)
|
||||
|
||||
        if (
            self.metadata.view_users is not None
            or self.metadata.view_groups is not None
            or self.metadata.change_users is not None
            or self.metadata.change_groups is not None
        ):
            permissions = {
                "view": {
                    "users": self.metadata.view_users or [],
                    "groups": self.metadata.view_groups or [],
                },
                "change": {
                    "users": self.metadata.change_users or [],
                    "groups": self.metadata.change_groups or [],
                },
            }
            set_permissions_for_object(permissions=permissions, object=document)

        if self.metadata.custom_fields:
            for field in CustomField.objects.filter(
                id__in=self.metadata.custom_fields.keys(),
            ).distinct():
                value_field_name = CustomFieldInstance.get_value_field_name(
                    data_type=field.data_type,
                )
                args = {
                    "field": field,
                    "document": document,
                    value_field_name: self.metadata.custom_fields.get(field.id, None),
                }
                CustomFieldInstance.objects.create(**args)  # adds to document

    def _write(self, storage_type, source, target):
        with (
            Path(source).open("rb") as read_file,
            Path(target).open("wb") as write_file,
        ):
            write_file.write(read_file.read())

        # Attempt to copy file's original stats, but it's ok if we can't
        try:
            copy_basic_file_stats(source, target)
        except Exception:  # pragma: no cover
            pass


class ConsumerPreflightPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
    AlwaysRunPluginMixin,
    LoggingMixin,
    ConsumerPluginMixin,
    ConsumeTaskPlugin,
):
    NAME: str = "ConsumerPreflightPlugin"
    logging_name = "paperless.consumer"

    def pre_check_file_exists(self):
        """
        Confirm the input file still exists where it should
        """
        if TYPE_CHECKING:
            assert isinstance(self.input_doc.original_file, Path), (
                self.input_doc.original_file
            )
        if not self.input_doc.original_file.is_file():
            self._fail(
                ConsumerStatusShortMessage.FILE_NOT_FOUND,
                f"Cannot consume {self.input_doc.original_file}: File not found.",
            )

    def pre_check_duplicate(self):
        """
        Using the MD5 of the file, check this exact file doesn't already exist
        """
        with Path(self.input_doc.original_file).open("rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        existing_doc = Document.global_objects.filter(
            Q(checksum=checksum) | Q(archive_checksum=checksum),
        )
        if existing_doc.exists():
            msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
            log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."

            if existing_doc.first().deleted_at is not None:
                msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
                log_msg += " Note: existing document is in the trash."

            if settings.CONSUMER_DELETE_DUPLICATES:
                Path(self.input_doc.original_file).unlink()
            self._fail(
                msg,
                log_msg,
            )

    def pre_check_directories(self):
        """
        Ensure all required directories exist before attempting to use them
        """
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
        settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
        settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)

    def pre_check_asn_value(self):
        """
        Check that if override_asn is given, it is unique and within a valid range
        """
        if self.metadata.asn is None:
            # check not necessary in case no ASN gets set
            return
        # Validate the range is above zero and less than uint32_t max
        # otherwise, Whoosh can't handle it in the index
        if (
            self.metadata.asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
            or self.metadata.asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
        ):
            self._fail(
                ConsumerStatusShortMessage.ASN_RANGE,
                f"Not consuming {self.filename}: "
                f"Given ASN {self.metadata.asn} is out of range "
                f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
                f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
            )
        existing_asn_doc = Document.global_objects.filter(
            archive_serial_number=self.metadata.asn,
        )
        if existing_asn_doc.exists():
            msg = ConsumerStatusShortMessage.ASN_ALREADY_EXISTS
            log_msg = f"Not consuming {self.filename}: Given ASN {self.metadata.asn} already exists!"

            if existing_asn_doc.first().deleted_at is not None:
                msg = ConsumerStatusShortMessage.ASN_ALREADY_EXISTS_IN_TRASH
                log_msg += " Note: existing document is in the trash."

            self._fail(
                msg,
                log_msg,
            )

    def run(self) -> None:
        self._send_progress(
            0,
            100,
            ProgressStatusOptions.STARTED,
            ConsumerStatusShortMessage.NEW_FILE,
        )

        # Make sure that preconditions for consuming the file are met.

        self.pre_check_file_exists()
        self.pre_check_duplicate()
        self.pre_check_directories()
        self.pre_check_asn_value()
35
src/documents/context_processors.py
Normal file
@@ -0,0 +1,35 @@
from django.conf import settings as django_settings
from django.contrib.auth.models import User

from documents.models import Document
from paperless.config import GeneralConfig


def settings(request):
    general_config = GeneralConfig()

    app_title = (
        django_settings.APP_TITLE
        if general_config.app_title is None or len(general_config.app_title) == 0
        else general_config.app_title
    )
    app_logo = (
        django_settings.APP_LOGO
        if general_config.app_logo is None or len(general_config.app_logo) == 0
        else django_settings.BASE_URL + general_config.app_logo.lstrip("/")
    )

    return {
        "EMAIL_ENABLED": django_settings.EMAIL_ENABLED,
        "DISABLE_REGULAR_LOGIN": django_settings.DISABLE_REGULAR_LOGIN,
        "REDIRECT_LOGIN_TO_SSO": django_settings.REDIRECT_LOGIN_TO_SSO,
        "ACCOUNT_ALLOW_SIGNUPS": django_settings.ACCOUNT_ALLOW_SIGNUPS,
        "domain": getattr(django_settings, "PAPERLESS_URL", request.get_host()),
        "APP_TITLE": app_title,
        "APP_LOGO": app_logo,
        "FIRST_INSTALL": User.objects.exclude(
            username__in=["consumer", "AnonymousUser"],
        ).count()
        == 0
        and Document.global_objects.count() == 0,
    }
50
src/documents/converters.py
Normal file
@@ -0,0 +1,50 @@
from pathlib import Path

import img2pdf
from django.conf import settings
from PIL import Image

from documents.utils import copy_basic_file_stats
from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess


def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
    """
    Converts a TIFF file into a PDF file.

    The PDF will be created in the given target_directory and share the name of
    the original TIFF file, as well as its stats (mtime etc.).

    Returns the path of the PDF created.
    """
    # override pixel setting if needed
    maybe_override_pixel_limit()

    with Image.open(tiff_path) as im:
        has_alpha_layer = im.mode in ("RGBA", "LA")
    if has_alpha_layer:
        # Note the save into the temp folder, so as not to trigger a new
        # consume
        scratch_image = target_directory / tiff_path.name
        run_subprocess(
            [
                settings.CONVERT_BINARY,
                "-alpha",
                "off",
                tiff_path,
                scratch_image,
            ],
        )
    else:
        # Not modifying the original, safe to use in place
        scratch_image = tiff_path

    pdf_path = (target_directory / tiff_path.name).with_suffix(".pdf")

    with scratch_image.open("rb") as img_file, pdf_path.open("wb") as pdf_file:
        pdf_file.write(img2pdf.convert(img_file))

    # Copy what file stat is possible
    copy_basic_file_stats(tiff_path, pdf_path)
    return pdf_path

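# Usage sketch (hypothetical paths, not part of this module):
#
#   pdf = convert_from_tiff_to_pdf(Path("/tmp/scan.tiff"), Path("/tmp/scratch"))
#   assert pdf.suffix == ".pdf"
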
175
src/documents/data_models.py
Normal file
@@ -0,0 +1,175 @@
import dataclasses
import datetime
from enum import IntEnum
from pathlib import Path

import magic
from guardian.shortcuts import get_groups_with_perms
from guardian.shortcuts import get_users_with_perms


@dataclasses.dataclass
class DocumentMetadataOverrides:
    """
    Manages overrides for document fields which normally would
    be set from content or matching. All fields default to None,
    meaning no override is happening
    """

    filename: str | None = None
    title: str | None = None
    correspondent_id: int | None = None
    document_type_id: int | None = None
    tag_ids: list[int] | None = None
    storage_path_id: int | None = None
    created: datetime.datetime | None = None
    asn: int | None = None
    owner_id: int | None = None
    view_users: list[int] | None = None
    view_groups: list[int] | None = None
    change_users: list[int] | None = None
    change_groups: list[int] | None = None
    custom_fields: dict | None = None

    def update(self, other: "DocumentMetadataOverrides") -> "DocumentMetadataOverrides":
        """
        Merges two DocumentMetadataOverrides objects such that object B's overrides
        are applied to object A or merged if multiple are accepted.

        The update is an in-place modification of self
        """
        # only if empty
        if other.title is not None:
            self.title = other.title
        if other.correspondent_id is not None:
            self.correspondent_id = other.correspondent_id
        if other.document_type_id is not None:
            self.document_type_id = other.document_type_id
        if other.storage_path_id is not None:
            self.storage_path_id = other.storage_path_id
        if other.owner_id is not None:
            self.owner_id = other.owner_id

        # merge
        if self.tag_ids is None:
            self.tag_ids = other.tag_ids
        elif other.tag_ids is not None:
            self.tag_ids.extend(other.tag_ids)
            self.tag_ids = list(set(self.tag_ids))

        if self.view_users is None:
            self.view_users = other.view_users
        elif other.view_users is not None:
            self.view_users.extend(other.view_users)
            self.view_users = list(set(self.view_users))

        if self.view_groups is None:
            self.view_groups = other.view_groups
        elif other.view_groups is not None:
            self.view_groups.extend(other.view_groups)
            self.view_groups = list(set(self.view_groups))

        if self.change_users is None:
            self.change_users = other.change_users
        elif other.change_users is not None:
            self.change_users.extend(other.change_users)
            self.change_users = list(set(self.change_users))

        if self.change_groups is None:
            self.change_groups = other.change_groups
        elif other.change_groups is not None:
            self.change_groups.extend(other.change_groups)
            self.change_groups = list(set(self.change_groups))

        if self.custom_fields is None:
            self.custom_fields = other.custom_fields
        elif other.custom_fields is not None:
            self.custom_fields.update(other.custom_fields)

        return self

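    # Merge-semantics sketch (illustrative values):
    #
    #   a = DocumentMetadataOverrides(title="A", tag_ids=[1, 2])
    #   b = DocumentMetadataOverrides(title="B", tag_ids=[2, 3])
    #   a.update(b)
    #   # a.title == "B"               (scalar fields: other wins when set)
    #   # set(a.tag_ids) == {1, 2, 3}  (list fields: merged and de-duplicated)
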
    @staticmethod
    def from_document(doc) -> "DocumentMetadataOverrides":
        """
        Fills in the overrides from a document object
        """
        overrides = DocumentMetadataOverrides()
        overrides.title = doc.title
        overrides.correspondent_id = doc.correspondent.id if doc.correspondent else None
        overrides.document_type_id = doc.document_type.id if doc.document_type else None
        overrides.storage_path_id = doc.storage_path.id if doc.storage_path else None
        overrides.owner_id = doc.owner.id if doc.owner else None
        overrides.tag_ids = list(doc.tags.values_list("id", flat=True))

        overrides.view_users = list(
            get_users_with_perms(
                doc,
                only_with_perms_in=["view_document"],
            ).values_list("id", flat=True),
        )
        overrides.change_users = list(
            get_users_with_perms(
                doc,
                only_with_perms_in=["change_document"],
            ).values_list("id", flat=True),
        )
        overrides.custom_fields = {
            custom_field.id: custom_field.value
            for custom_field in doc.custom_fields.all()
        }

        groups_with_perms = get_groups_with_perms(
            doc,
            attach_perms=True,
        )
        overrides.view_groups = [
            group.id
            for group in groups_with_perms
            if "view_document" in groups_with_perms[group]
        ]
        overrides.change_groups = [
            group.id
            for group in groups_with_perms
            if "change_document" in groups_with_perms[group]
        ]

        return overrides


class DocumentSource(IntEnum):
    """
    The source of an incoming document. May have other uses in the future
    """

    ConsumeFolder = 1
    ApiUpload = 2
    MailFetch = 3
    WebUI = 4


@dataclasses.dataclass
class ConsumableDocument:
    """
    Encapsulates an incoming document, either from consume folder, API upload
    or mail fetching and certain useful operations on it.
    """

    source: DocumentSource
    original_file: Path
    original_path: Path | None = None
    mailrule_id: int | None = None
    mime_type: str = dataclasses.field(init=False, default=None)

    def __post_init__(self):
        """
        After a dataclass is initialized, this is called to finalize some data
        1. Make sure the original path is an absolute, fully qualified path
        2. Get the mime type of the file
        """
        # Always fully qualify the path first thing
        # Just in case, convert to a path if it's a str
        self.original_file = Path(self.original_file).resolve()

        # Get the file type once at init
        # Note this function isn't called when the object is unpickled
        self.mime_type = magic.from_file(self.original_file, mime=True)

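# Construction sketch (hypothetical path): mime_type is not an init argument,
# it is detected in __post_init__ via libmagic:
#
#   doc = ConsumableDocument(
#       source=DocumentSource.ApiUpload,
#       original_file=Path("/tmp/upload.pdf"),
#   )
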
146
src/documents/double_sided.py
Normal file
@@ -0,0 +1,146 @@
import datetime as dt
import logging
import os
import shutil
from pathlib import Path
from typing import Final

from django.conf import settings
from pikepdf import Pdf

from documents.consumer import ConsumerError
from documents.converters import convert_from_tiff_to_pdf
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin
from documents.plugins.base import StopConsumeTaskError

logger = logging.getLogger("paperless.double_sided")

# Hardcoded for now, could be made a configurable setting if needed
TIMEOUT_MINUTES: Final[int] = 30
TIMEOUT_SECONDS: Final[int] = TIMEOUT_MINUTES * 60

# Used by test cases
STAGING_FILE_NAME = "double-sided-staging.pdf"


class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin):
    NAME: str = "CollatePlugin"

    @property
    def able_to_run(self) -> bool:
        return (
            settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED
            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
            in self.input_doc.original_file.parts
        )

    def run(self) -> str | None:
        """
        Tries to collate pages from 2 single sided scans of a double sided
        document.

        When called with a file, it checks whether or not a staging file
        exists, if not, the current file is turned into that staging file
        containing the odd numbered pages.

        If a staging file exists, and it is not too old, the current file is
        considered to be the second part (the even numbered pages) and it will
        collate the pages of both, the pages of the second file will be added
        in reverse order, since the ADF will have scanned the pages from bottom
        to top.

        Returns a status message on success, or raises a ConsumerError
        in case of failure.
        """

        if self.input_doc.mime_type == "application/pdf":
            pdf_file = self.input_doc.original_file
        elif (
            self.input_doc.mime_type == "image/tiff"
            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
        ):
            pdf_file = convert_from_tiff_to_pdf(
                self.input_doc.original_file,
                self.base_tmp_dir,
            )
            self.input_doc.original_file.unlink()
        else:
            raise ConsumerError(
                "Unsupported file type for collation of double-sided scans",
            )

        staging: Path = settings.SCRATCH_DIR / STAGING_FILE_NAME

        valid_staging_exists = False
        if staging.exists():
            stats = staging.stat()
            # if the file is older than the timeout, we don't consider
            # it valid
            if (dt.datetime.now().timestamp() - stats.st_mtime) > TIMEOUT_SECONDS:
                logger.warning("Outdated double sided staging file exists, deleting it")
                staging.unlink()
            else:
                valid_staging_exists = True

        if valid_staging_exists:
            try:
                # Collate pages from second PDF in reverse order
                with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
                    pdf2.pages.reverse()
                    try:
                        for i, page in enumerate(pdf2.pages):
                            pdf1.pages.insert(2 * i + 1, page)
                    except IndexError:
                        raise ConsumerError(
                            "This second file (even numbered pages) contains more "
                            "pages than the first/odd numbered one. This means the "
                            "two uploaded files don't belong to the same double-"
                            "sided scan. Please retry, starting with the odd "
                            "numbered pages again.",
                        )
                    # Merged file has the same path, but without the
                    # double-sided subdir. Therefore, it is also in the
                    # consumption dir and will be picked up for processing
                    old_file = self.input_doc.original_file
                    new_file = Path(
                        *(
                            part
                            for part in old_file.with_name(
                                f"{old_file.stem}-collated.pdf",
                            ).parts
                            if part
                            != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
                        ),
                    )
                    # If the user didn't create the subdirs yet, do it for them
                    new_file.parent.mkdir(parents=True, exist_ok=True)
                    pdf1.save(new_file)
                    logger.info("Collated documents into new file %s", new_file)
                    raise StopConsumeTaskError(
                        "Success. Even numbered pages of double sided scan collated "
                        "with odd pages",
                    )
            finally:
                # Delete staging and recently uploaded file no matter what.
                # If any error occurs, the user needs to be able to restart
                # the process from scratch; after all, the staging file
                # with the odd numbered pages might be the culprit
                pdf_file.unlink()
                staging.unlink()

        else:
            shutil.move(pdf_file, staging)
            # update access and modification times so we know if the file
            # is outdated when another file gets uploaded
            timestamp = dt.datetime.now().timestamp()
            os.utime(staging, (timestamp, timestamp))
            logger.info(
                "Got scan with odd numbered pages of double-sided scan, moved it to %s",
                staging,
            )
            raise StopConsumeTaskError(
                "Received odd numbered pages of double sided scan, waiting up to "
                f"{TIMEOUT_MINUTES} minutes for even numbered pages",
            )

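# Collation sketch (illustrative page numbers): if the staging file holds the
# odd pages [1, 3, 5] and the second upload holds the even pages scanned
# bottom-up [6, 4, 2], reversing the second file gives [2, 4, 6] and inserting
# its page i at index 2*i + 1 interleaves them into [1, 2, 3, 4, 5, 6].
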
176
src/documents/file_handling.py
Normal file
@@ -0,0 +1,176 @@
import os
from pathlib import Path

from django.conf import settings

from documents.models import Document
from documents.templating.filepath import validate_filepath_template_and_render
from documents.templating.utils import convert_format_str_to_template_format


def create_source_path_directory(source_path: Path) -> None:
    source_path.parent.mkdir(parents=True, exist_ok=True)


def delete_empty_directories(directory: Path, root: Path) -> None:
    if not directory.is_dir():
        return

    if not directory.is_relative_to(root):
        # don't do anything outside our originals folder.

        # the old implementation appended os.sep before a startswith() check
        # to avoid cases like these, which is_relative_to() handles directly:
        #   directory = /home/originals2/test
        #   root = /home/originals ("/" gets appended and startswith fails)
        return

    # Go up in the directory hierarchy and try to delete all directories
    while directory != root:
        if not list(directory.iterdir()):
            # it's empty
            try:
                directory.rmdir()
            except OSError:
                # whatever. empty directories aren't that bad anyway.
                return
        else:
            # it's not empty.
            return

        # go one level up
        directory = directory.parent


def generate_unique_filename(doc, *, archive_filename=False) -> Path:
"""
|
||||
Generates a unique filename for doc in settings.ORIGINALS_DIR.
|
||||
|
||||
The returned filename is guaranteed to be either the current filename
|
||||
of the document if unchanged, or a new filename that does not correspondent
|
||||
to any existing files. The function will append _01, _02, etc to the
|
||||
filename before the extension to avoid conflicts.
|
||||
|
||||
If archive_filename is True, return a unique archive filename instead.
|
||||
|
||||
"""
|
||||
    if archive_filename:
        old_filename: Path | None = (
            Path(doc.archive_filename) if doc.archive_filename else None
        )
        root = settings.ARCHIVE_DIR
    else:
        old_filename = Path(doc.filename) if doc.filename else None
        root = settings.ORIGINALS_DIR

    # If generating archive filenames, try to make a name that is similar to
    # the original filename first.

    if archive_filename and doc.filename:
        # Generate the full path using the same logic as generate_filename
        base_generated = generate_filename(doc, archive_filename=archive_filename)

        # Try to create a simple PDF version based on the original filename
        # but preserve any directory structure from the template
        if str(base_generated.parent) != ".":
            # Has directory structure, preserve it
            simple_pdf_name = base_generated.parent / (Path(doc.filename).stem + ".pdf")
        else:
            # No directory structure
            simple_pdf_name = Path(Path(doc.filename).stem + ".pdf")

        if simple_pdf_name == old_filename or not (root / simple_pdf_name).exists():
            return simple_pdf_name

    counter = 0

    while True:
        new_filename = generate_filename(
            doc,
            counter=counter,
            archive_filename=archive_filename,
        )
        if new_filename == old_filename:
            # still the same as before.
            return new_filename

        if (root / new_filename).exists():
            counter += 1
        else:
            return new_filename


def generate_filename(
    doc: Document,
    *,
    counter=0,
    append_gpg=True,
    archive_filename=False,
) -> Path:
    base_path: Path | None = None

    def format_filename(document: Document, template_str: str) -> str | None:
        rendered_filename = validate_filepath_template_and_render(
            template_str,
            document,
        )
        if rendered_filename is None:
            return None

        # Apply this setting. It could become a filter in the future (or users could use |default)
        if settings.FILENAME_FORMAT_REMOVE_NONE:
            rendered_filename = rendered_filename.replace("/-none-/", "/")
            rendered_filename = rendered_filename.replace(" -none-", "")
            rendered_filename = rendered_filename.replace("-none-", "")
            rendered_filename = rendered_filename.strip(os.sep)

        rendered_filename = rendered_filename.replace(
            "-none-",
            "none",
        )  # backward compatibility

        return rendered_filename

    # Determine the source of the format string
    if doc.storage_path is not None:
        filename_format = doc.storage_path.path
    elif settings.FILENAME_FORMAT is not None:
        # Maybe convert old to new style
        filename_format = convert_format_str_to_template_format(
            settings.FILENAME_FORMAT,
        )
    else:
        filename_format = None

    # If we have one, render it
    if filename_format is not None:
        rendered_path: str | None = format_filename(doc, filename_format)
        if rendered_path:
            base_path = Path(rendered_path)

    counter_str = f"_{counter:02}" if counter else ""
    filetype_str = ".pdf" if archive_filename else doc.file_type

    if base_path:
        # Split the path into directory and filename parts
        directory = base_path.parent
        # Use the full name (not just stem) as the base filename
        base_filename = base_path.name

        # Build the final filename with counter and filetype
        final_filename = f"{base_filename}{counter_str}{filetype_str}"

        # If we have a directory component, include it
        if str(directory) != ".":
            full_path = directory / final_filename
        else:
            full_path = Path(final_filename)
    else:
        # No template, use document ID
        final_filename = f"{doc.pk:07}{counter_str}{filetype_str}"
        full_path = Path(final_filename)

    # Add GPG extension if needed
    if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
        full_path = full_path.with_suffix(full_path.suffix + ".gpg")

    return full_path

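# Naming sketch (hypothetical values): if a document's template renders to
# "invoices/acme" and that name is taken, generate_filename(doc, counter=1)
# yields "invoices/acme_01.pdf" for an archive filename (or the document's
# own file type otherwise), so generate_unique_filename counts upward until
# a free name appears.
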
971
src/documents/filters.py
Normal file
@@ -0,0 +1,971 @@
from __future__ import annotations

import functools
import inspect
import json
import operator
from contextlib import contextmanager
from typing import TYPE_CHECKING

from django.contrib.contenttypes.models import ContentType
from django.db.models import Case
from django.db.models import CharField
from django.db.models import Count
from django.db.models import Exists
from django.db.models import IntegerField
from django.db.models import OuterRef
from django.db.models import Q
from django.db.models import Subquery
from django.db.models import Sum
from django.db.models import Value
from django.db.models import When
from django.db.models.functions import Cast
from django.utils.translation import gettext_lazy as _
from django_filters import DateFilter
from django_filters.rest_framework import BooleanFilter
from django_filters.rest_framework import Filter
from django_filters.rest_framework import FilterSet
from drf_spectacular.utils import extend_schema_field
from guardian.utils import get_group_obj_perms_model
from guardian.utils import get_user_obj_perms_model
from rest_framework import serializers
from rest_framework.filters import OrderingFilter
from rest_framework_guardian.filters import ObjectPermissionsFilter

from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag

if TYPE_CHECKING:
    from collections.abc import Callable

CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"]
ID_KWARGS = ["in", "exact"]
INT_KWARGS = ["exact", "gt", "gte", "lt", "lte", "isnull"]
DATE_KWARGS = [
    "year",
    "month",
    "day",
    "gt",
    "gte",
    "lt",
    "lte",
]
DATETIME_KWARGS = [
    "year",
    "month",
    "day",
    "date__gt",
    "date__gte",
    "gt",
    "gte",
    "date__lt",
    "date__lte",
    "lt",
    "lte",
]

CUSTOM_FIELD_QUERY_MAX_DEPTH = 10
CUSTOM_FIELD_QUERY_MAX_ATOMS = 20


class CorrespondentFilterSet(FilterSet):
    class Meta:
        model = Correspondent
        fields = {
            "id": ID_KWARGS,
            "name": CHAR_KWARGS,
        }


class TagFilterSet(FilterSet):
    class Meta:
        model = Tag
        fields = {
            "id": ID_KWARGS,
            "name": CHAR_KWARGS,
        }


class DocumentTypeFilterSet(FilterSet):
    class Meta:
        model = DocumentType
        fields = {
            "id": ID_KWARGS,
            "name": CHAR_KWARGS,
        }


class StoragePathFilterSet(FilterSet):
    class Meta:
        model = StoragePath
        fields = {
            "id": ID_KWARGS,
            "name": CHAR_KWARGS,
            "path": CHAR_KWARGS,
        }


class ObjectFilter(Filter):
    def __init__(self, *, exclude=False, in_list=False, field_name=""):
        super().__init__()
        self.exclude = exclude
        self.in_list = in_list
        self.field_name = field_name

    def filter(self, qs, value):
        if not value:
            return qs

        try:
            object_ids = [int(x) for x in value.split(",")]
        except ValueError:
            return qs

        if self.in_list:
            qs = qs.filter(**{f"{self.field_name}__id__in": object_ids}).distinct()
        else:
            for obj_id in object_ids:
                if self.exclude:
                    qs = qs.exclude(**{f"{self.field_name}__id": obj_id})
                else:
                    qs = qs.filter(**{f"{self.field_name}__id": obj_id})

        return qs

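# Query-string sketch (illustrative): with field_name="tags", a value of
# "1,2" on an ObjectFilter requires every listed tag id on a document; with
# in_list=True it matches documents having any of them, and with exclude=True
# it drops documents carrying any listed id.
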

@extend_schema_field(serializers.BooleanField)
class InboxFilter(Filter):
    def filter(self, qs, value):
        if value == "true":
            return qs.filter(tags__is_inbox_tag=True)
        elif value == "false":
            return qs.exclude(tags__is_inbox_tag=True)
        else:
            return qs


@extend_schema_field(serializers.CharField)
class TitleContentFilter(Filter):
    def filter(self, qs, value):
        if value:
            return qs.filter(Q(title__icontains=value) | Q(content__icontains=value))
        else:
            return qs


@extend_schema_field(serializers.BooleanField)
class SharedByUser(Filter):
    def filter(self, qs, value):
        ctype = ContentType.objects.get_for_model(self.model)
        UserObjectPermission = get_user_obj_perms_model()
        GroupObjectPermission = get_group_obj_perms_model()
        # see https://github.com/paperless-ngx/paperless-ngx/issues/5392, we limit subqueries
        # to 1 because Postgres doesn't like returning > 1 row, but all we care about is > 0
        return (
            qs.filter(
                owner_id=value,
            )
            .annotate(
                num_shared_users=Count(
                    UserObjectPermission.objects.filter(
                        content_type=ctype,
                        object_pk=Cast(OuterRef("pk"), CharField()),
                    ).values("user_id")[:1],
                ),
            )
            .annotate(
                num_shared_groups=Count(
                    GroupObjectPermission.objects.filter(
                        content_type=ctype,
                        object_pk=Cast(OuterRef("pk"), CharField()),
                    ).values("group_id")[:1],
                ),
            )
            .filter(
                Q(num_shared_users__gt=0) | Q(num_shared_groups__gt=0),
            )
            if value is not None
            else qs
        )


class CustomFieldFilterSet(FilterSet):
    class Meta:
        model = CustomField
        fields = {
            "id": ID_KWARGS,
            "name": CHAR_KWARGS,
        }


@extend_schema_field(serializers.CharField)
class CustomFieldsFilter(Filter):
    def filter(self, qs, value):
        if value:
            fields_with_matching_selects = CustomField.objects.filter(
                extra_data__icontains=value,
            )
            option_ids = []
            if fields_with_matching_selects.count() > 0:
                for field in fields_with_matching_selects:
                    options = field.extra_data.get("select_options", [])
                    for _, option in enumerate(options):
                        if option.get("label").lower().find(value.lower()) != -1:
                            option_ids.extend([option.get("id")])
            return (
                qs.filter(custom_fields__field__name__icontains=value)
                | qs.filter(custom_fields__value_text__icontains=value)
                | qs.filter(custom_fields__value_bool__icontains=value)
                | qs.filter(custom_fields__value_int__icontains=value)
                | qs.filter(custom_fields__value_float__icontains=value)
                | qs.filter(custom_fields__value_date__icontains=value)
                | qs.filter(custom_fields__value_url__icontains=value)
                | qs.filter(custom_fields__value_monetary__icontains=value)
                | qs.filter(custom_fields__value_document_ids__icontains=value)
                | qs.filter(custom_fields__value_select__in=option_ids)
                | qs.filter(custom_fields__value_long_text__icontains=value)
            )
        else:
            return qs


class MimeTypeFilter(Filter):
    def filter(self, qs, value):
        if value:
            return qs.filter(mime_type__icontains=value)
        else:
            return qs


class SelectField(serializers.CharField):
    def __init__(self, custom_field: CustomField):
        self._options = custom_field.extra_data["select_options"]
        super().__init__(max_length=16)

    def to_internal_value(self, data):
        # If the supplied value is the option label instead of the ID
        try:
            data = next(
                option.get("id")
                for option in self._options
                if option.get("label") == data
            )
        except StopIteration:
            pass
        return super().to_internal_value(data)


def handle_validation_prefix(func: Callable):
    """
    Catch ValidationErrors raised by the wrapped function
    and add a prefix to the exception detail to track what causes the exception,
    similar to nested serializers.
    """

    def wrapper(*args, validation_prefix=None, **kwargs):
        try:
            return func(*args, **kwargs)
        except serializers.ValidationError as e:
            raise serializers.ValidationError({validation_prefix: e.detail})

    # Update the signature to include the validation_prefix argument
    old_sig = inspect.signature(func)
    new_param = inspect.Parameter("validation_prefix", inspect.Parameter.KEYWORD_ONLY)
    new_sig = old_sig.replace(parameters=[*old_sig.parameters.values(), new_param])

    # Apply functools.wraps and manually set the new signature
    functools.update_wrapper(wrapper, func)
    wrapper.__signature__ = new_sig

    return wrapper


class CustomFieldQueryParser:
    EXPR_BY_CATEGORY = {
        "basic": ["exact", "in", "isnull", "exists"],
        "string": [
            "icontains",
            "istartswith",
            "iendswith",
        ],
        "arithmetic": [
            "gt",
            "gte",
            "lt",
            "lte",
            "range",
        ],
        "containment": ["contains"],
    }

    SUPPORTED_EXPR_CATEGORIES = {
        CustomField.FieldDataType.STRING: ("basic", "string"),
        CustomField.FieldDataType.URL: ("basic", "string"),
        CustomField.FieldDataType.DATE: ("basic", "arithmetic"),
        CustomField.FieldDataType.BOOL: ("basic",),
        CustomField.FieldDataType.INT: ("basic", "arithmetic"),
        CustomField.FieldDataType.FLOAT: ("basic", "arithmetic"),
        CustomField.FieldDataType.MONETARY: ("basic", "string", "arithmetic"),
        CustomField.FieldDataType.DOCUMENTLINK: ("basic", "containment"),
        CustomField.FieldDataType.SELECT: ("basic",),
        CustomField.FieldDataType.LONG_TEXT: ("basic", "string"),
    }

    DATE_COMPONENTS = [
        "year",
        "iso_year",
        "month",
        "day",
        "week",
        "week_day",
        "iso_week_day",
        "quarter",
    ]

    def __init__(
        self,
        validation_prefix,
        max_query_depth=10,
        max_atom_count=20,
    ) -> None:
        """
        A helper class that parses the query string into a `django.db.models.Q` for filtering
        documents based on custom field values.

        The syntax of the query expression is illustrated with the below pseudo code rules:
        1. parse([`custom_field`, "exists", true]):
           matches documents with Q(custom_fields__field=`custom_field`)
        2. parse([`custom_field`, "exists", false]):
           matches documents with ~Q(custom_fields__field=`custom_field`)
        3. parse([`custom_field`, `op`, `value`]):
           matches documents with
           Q(custom_fields__field=`custom_field`, custom_fields__value_`type`__`op`=`value`)
        4. parse(["AND", [`q0`, `q1`, ..., `qn`]])
           -> parse(`q0`) & parse(`q1`) & ... & parse(`qn`)
        5. parse(["OR", [`q0`, `q1`, ..., `qn`]])
           -> parse(`q0`) | parse(`q1`) | ... | parse(`qn`)
        6. parse(["NOT", `q`])
           -> ~parse(`q`)

        Args:
            validation_prefix: Used to generate the ValidationError message.
            max_query_depth: Limits the maximum nesting depth of queries.
            max_atom_count: Limits the maximum number of atoms (i.e., rule 1, 2, 3) in the query.

        `max_query_depth` and `max_atom_count` can be set to guard against generating arbitrarily
        complex SQL queries.
        """
        self._custom_fields: dict[int | str, CustomField] = {}
        self._validation_prefix = validation_prefix
        # Dummy ModelSerializer used to convert a Django models.Field to serializers.Field.
        self._model_serializer = serializers.ModelSerializer()
        # Used for sanity check
        self._max_query_depth = max_query_depth
        self._max_atom_count = max_atom_count
        self._current_depth = 0
        self._atom_count = 0
        # The set of annotations that we need to apply to the queryset
        self._annotations = {}

    def parse(self, query: str) -> tuple[Q, dict[str, Count]]:
        """
        Parses the query string into a `django.db.models.Q`
        and a set of annotations to be applied to the queryset.
        """
        try:
            expr = json.loads(query)
        except json.JSONDecodeError:
            raise serializers.ValidationError(
                {self._validation_prefix: [_("Value must be valid JSON.")]},
            )
        return (
            self._parse_expr(expr, validation_prefix=self._validation_prefix),
            self._annotations,
        )

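    # Query sketch (illustrative field names): parse() accepts JSON such as
    #   '["AND", [["invoice_number", "exists", true], ["amount", "gte", 100]]]'
    # which rules 4 and 1/3 above turn into Q(...) & Q(...) plus the matching
    # count annotations.
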
    @handle_validation_prefix
    def _parse_expr(self, expr) -> Q:
        """
        Applies rule (1, 2, 3) or (4, 5, 6) based on the length of the expr.
        """
        with self._track_query_depth():
            if isinstance(expr, list | tuple):
                if len(expr) == 2:
                    return self._parse_logical_expr(*expr)
                elif len(expr) == 3:
                    return self._parse_atom(*expr)
            raise serializers.ValidationError(
                [_("Invalid custom field query expression")],
            )

    @handle_validation_prefix
    def _parse_expr_list(self, exprs) -> list[Q]:
        """
        Handles [`q0`, `q1`, ..., `qn`] in rule 4 & 5.
        """
        if not isinstance(exprs, list | tuple) or not exprs:
            raise serializers.ValidationError(
                [_("Invalid expression list. Must be nonempty.")],
            )
        return [
            self._parse_expr(expr, validation_prefix=i) for i, expr in enumerate(exprs)
        ]

    def _parse_logical_expr(self, op, args) -> Q:
        """
        Handles rule 4, 5, 6.
        """
        op_lower = op.lower()

        if op_lower == "not":
            return ~self._parse_expr(args, validation_prefix=1)

        if op_lower == "and":
            op_func = operator.and_
        elif op_lower == "or":
            op_func = operator.or_
        else:
            raise serializers.ValidationError(
                {"0": [_("Invalid logical operator {op!r}").format(op=op)]},
            )

        qs = self._parse_expr_list(args, validation_prefix="1")
        return functools.reduce(op_func, qs)

    def _parse_atom(self, id_or_name, op, value) -> Q:
        """
        Handles rule 1, 2, 3.
        """
        # Guard against queries with too many conditions.
        self._atom_count += 1
        if self._atom_count > self._max_atom_count:
            raise serializers.ValidationError(
                [_("Maximum number of query conditions exceeded.")],
            )

        custom_field = self._get_custom_field(id_or_name, validation_prefix="0")
        op = self._validate_atom_op(custom_field, op, validation_prefix="1")
        value = self._validate_atom_value(
            custom_field,
            op,
            value,
            validation_prefix="2",
        )

        # Needed because not all DB backends support Array __contains
        if (
            custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK
            and op == "contains"
        ):
            return self._parse_atom_doc_link_contains(custom_field, value)

        value_field_name = CustomFieldInstance.get_value_field_name(
            custom_field.data_type,
        )
        if (
            custom_field.data_type == CustomField.FieldDataType.MONETARY
            and op in self.EXPR_BY_CATEGORY["arithmetic"]
        ):
            value_field_name = "value_monetary_amount"
        has_field = Q(custom_fields__field=custom_field)

        # We need to use an annotation here because different atoms
        # might be referring to different instances of custom fields.
        annotation_name = f"_custom_field_filter_{len(self._annotations)}"

        # Our special exists operator.
        if op == "exists":
            annotation = Count("custom_fields", filter=has_field)
            # A Document should have > 0 match if it has this field, or 0 if doesn't.
            query_op = "gt" if value else "exact"
            query = Q(**{f"{annotation_name}__{query_op}": 0})
        else:
            # Check if 1) custom field name matches, and 2) value satisfies condition
            field_filter = has_field & Q(
                **{f"custom_fields__{value_field_name}__{op}": value},
            )
            # Annotate how many matching custom fields each document has
            annotation = Count("custom_fields", filter=field_filter)
            # Filter document by count
            query = Q(**{f"{annotation_name}__gt": 0})

        self._annotations[annotation_name] = annotation
        return query

    @handle_validation_prefix
    def _get_custom_field(self, id_or_name):
        """Get the CustomField instance by id or name."""
        if id_or_name in self._custom_fields:
            return self._custom_fields[id_or_name]

        kwargs = (
            {"id": id_or_name} if isinstance(id_or_name, int) else {"name": id_or_name}
        )
        try:
            custom_field = CustomField.objects.get(**kwargs)
        except CustomField.DoesNotExist:
            raise serializers.ValidationError(
                [_("{name!r} is not a valid custom field.").format(name=id_or_name)],
            )
        self._custom_fields[custom_field.id] = custom_field
        self._custom_fields[custom_field.name] = custom_field
        return custom_field

    @staticmethod
    def _split_op(full_op):
        *prefix, op = str(full_op).rsplit("__", maxsplit=1)
        prefix = prefix[0] if prefix else None
        return prefix, op

    @handle_validation_prefix
    def _validate_atom_op(self, custom_field, raw_op):
        """Check if the `op` is compatible with the type of the custom field."""
        prefix, op = self._split_op(raw_op)

        # Check if the operator is supported for the current data_type.
        supported = False
        for category in self.SUPPORTED_EXPR_CATEGORIES[custom_field.data_type]:
            if op in self.EXPR_BY_CATEGORY[category]:
                supported = True
                break

        # Check prefix
        if prefix is not None:
            if (
                prefix in self.DATE_COMPONENTS
                and custom_field.data_type == CustomField.FieldDataType.DATE
            ):
                pass  # ok - e.g., "year__exact" for date field
            else:
                supported = False  # anything else is invalid

        if not supported:
            raise serializers.ValidationError(
                [
                    _("{data_type} does not support query expr {expr!r}.").format(
                        data_type=custom_field.data_type,
                        expr=raw_op,
                    ),
                ],
            )

        return raw_op

    def _get_serializer_field(self, custom_field, full_op):
        """Return a serializers.Field for value validation."""
        prefix, op = self._split_op(full_op)
        field = None

        if op in ("isnull", "exists"):
            # `isnull` takes either True or False regardless of the data_type.
            field = serializers.BooleanField()
        elif (
            custom_field.data_type == CustomField.FieldDataType.DATE
            and prefix in self.DATE_COMPONENTS
        ):
            # DateField admits queries in the form of `year__exact`, etc. These take integers.
            field = serializers.IntegerField()
        elif custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
            # We can be more specific here and make sure the value is a list.
            field = serializers.ListField(child=serializers.IntegerField())
        elif custom_field.data_type == CustomField.FieldDataType.SELECT:
            # We use this custom field to permit SELECT option names.
            field = SelectField(custom_field)
        elif custom_field.data_type == CustomField.FieldDataType.URL:
            # For URL fields we don't need to be strict about validation (e.g., for istartswith).
            field = serializers.CharField()
        else:
            # The general case: inferred from the corresponding field in CustomFieldInstance.
            value_field_name = CustomFieldInstance.get_value_field_name(
                custom_field.data_type,
            )
            model_field = CustomFieldInstance._meta.get_field(value_field_name)
            field_name = model_field.deconstruct()[0]
            field_class, field_kwargs = self._model_serializer.build_standard_field(
                field_name,
                model_field,
            )
            field = field_class(**field_kwargs)
            field.allow_null = False

            # Need to set allow_blank manually because of the inconsistency in CustomFieldInstance validation.
            # See https://github.com/paperless-ngx/paperless-ngx/issues/7361.
            if isinstance(field, serializers.CharField):
                field.allow_blank = True

        if op == "in":
            # `in` takes a list of values.
            field = serializers.ListField(child=field, allow_empty=False)
        elif op == "range":
            # `range` takes a list of values, i.e., [start, end].
            field = serializers.ListField(
                child=field,
                min_length=2,
                max_length=2,
            )

        return field

    @handle_validation_prefix
    def _validate_atom_value(self, custom_field, op, value):
        """Check if `value` is valid for the custom field and `op`. Returns the validated value."""
        serializer_field = self._get_serializer_field(custom_field, op)
        return serializer_field.run_validation(value)

    def _parse_atom_doc_link_contains(self, custom_field, value) -> Q:
        """
        Handles document link `contains` in a way that is supported by all DB backends.
        """

        # If the value is an empty set,
        # this is trivially true for any document with not null document links.
        if not value:
            return Q(
                custom_fields__field=custom_field,
                custom_fields__value_document_ids__isnull=False,
            )

        # First we look up reverse links from the requested documents.
        links = CustomFieldInstance.objects.filter(
            document_id__in=value,
            field__data_type=CustomField.FieldDataType.DOCUMENTLINK,
        )

        # Check if any of the requested IDs are missing.
        missing_ids = set(value) - set(link.document_id for link in links)
        if missing_ids:
            # The result should be an empty set in this case.
            return Q(id__in=[])

        # Take the intersection of the reverse links - this should be what we are looking for.
        document_ids_we_want = functools.reduce(
            operator.and_,
            (set(link.value_document_ids) for link in links),
        )

        return Q(id__in=document_ids_we_want)

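    # Intersection sketch (illustrative ids): for contains [3, 4], if document
    # 3 links to {10, 11} and document 4 links to {11, 12}, only ids in the
    # intersection {11} link to both, so the result is Q(id__in={11}).
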
@contextmanager
|
||||
def _track_query_depth(self):
|
||||
# guard against queries that are too deeply nested
|
||||
self._current_depth += 1
|
||||
if self._current_depth > self._max_query_depth:
|
||||
raise serializers.ValidationError([_("Maximum nesting depth exceeded.")])
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
self._current_depth -= 1
|
||||
|
||||
|
||||
@extend_schema_field(serializers.CharField)
|
||||
class CustomFieldQueryFilter(Filter):
|
||||
def __init__(self, validation_prefix):
|
||||
"""
|
||||
A filter that filters documents based on custom field name and value.
|
||||
|
||||
Args:
|
||||
validation_prefix: Used to generate the ValidationError message.
|
||||
"""
|
||||
super().__init__()
|
||||
self._validation_prefix = validation_prefix
|
||||
|
||||
def filter(self, qs, value):
|
||||
if not value:
|
||||
return qs
|
||||
|
||||
parser = CustomFieldQueryParser(
|
||||
self._validation_prefix,
|
||||
max_query_depth=CUSTOM_FIELD_QUERY_MAX_DEPTH,
|
||||
max_atom_count=CUSTOM_FIELD_QUERY_MAX_ATOMS,
|
||||
)
|
||||
q, annotations = parser.parse(value)
|
||||
|
||||
return qs.annotate(**annotations).filter(q)
|
||||
|
||||
|
||||
class DocumentFilterSet(FilterSet):
|
||||
is_tagged = BooleanFilter(
|
||||
label="Is tagged",
|
||||
field_name="tags",
|
||||
lookup_expr="isnull",
|
||||
exclude=True,
|
||||
)
|
||||
|
||||
tags__id__all = ObjectFilter(field_name="tags")
|
||||
|
||||
tags__id__none = ObjectFilter(field_name="tags", exclude=True)
|
||||
|
||||
tags__id__in = ObjectFilter(field_name="tags", in_list=True)
|
||||
|
||||
correspondent__id__none = ObjectFilter(field_name="correspondent", exclude=True)
|
||||
|
||||
document_type__id__none = ObjectFilter(field_name="document_type", exclude=True)
|
||||
|
||||
storage_path__id__none = ObjectFilter(field_name="storage_path", exclude=True)
|
||||
|
||||
is_in_inbox = InboxFilter()
|
||||
|
||||
title_content = TitleContentFilter()
|
||||
|
||||
owner__id__none = ObjectFilter(field_name="owner", exclude=True)
|
||||
|
||||
custom_fields__icontains = CustomFieldsFilter()
|
||||
|
||||
custom_fields__id__all = ObjectFilter(field_name="custom_fields__field")
|
||||
|
||||
custom_fields__id__none = ObjectFilter(
|
||||
field_name="custom_fields__field",
|
||||
exclude=True,
|
||||
)
|
||||
|
||||
custom_fields__id__in = ObjectFilter(
|
||||
field_name="custom_fields__field",
|
||||
in_list=True,
|
||||
)
|
||||
|
||||
has_custom_fields = BooleanFilter(
|
||||
label="Has custom field",
|
||||
field_name="custom_fields",
|
||||
lookup_expr="isnull",
|
||||
exclude=True,
|
||||
)
|
||||
|
||||
custom_field_query = CustomFieldQueryFilter("custom_field_query")
|
||||
|
||||
shared_by__id = SharedByUser()
|
||||
|
||||
mime_type = MimeTypeFilter()
|
||||
|
||||
# Backwards compatibility
|
||||
created__date__gt = DateFilter(field_name="created", lookup_expr="gt")
|
||||
created__date__gte = DateFilter(field_name="created", lookup_expr="gte")
|
||||
created__date__lt = DateFilter(field_name="created", lookup_expr="lt")
|
||||
created__date__lte = DateFilter(field_name="created", lookup_expr="lte")
|
||||
|
||||
class Meta:
|
||||
model = Document
|
||||
fields = {
|
||||
"id": ID_KWARGS,
|
||||
"title": CHAR_KWARGS,
|
||||
"content": CHAR_KWARGS,
|
||||
"archive_serial_number": INT_KWARGS,
|
||||
"created": DATE_KWARGS,
|
||||
"added": DATETIME_KWARGS,
|
||||
"modified": DATETIME_KWARGS,
|
||||
"original_filename": CHAR_KWARGS,
|
||||
"checksum": CHAR_KWARGS,
|
||||
"correspondent": ["isnull"],
|
||||
"correspondent__id": ID_KWARGS,
|
||||
"correspondent__name": CHAR_KWARGS,
|
||||
"tags__id": ID_KWARGS,
|
||||
"tags__name": CHAR_KWARGS,
|
||||
"document_type": ["isnull"],
|
||||
"document_type__id": ID_KWARGS,
|
||||
"document_type__name": CHAR_KWARGS,
|
||||
"storage_path": ["isnull"],
|
||||
"storage_path__id": ID_KWARGS,
|
||||
"storage_path__name": CHAR_KWARGS,
|
||||
"owner": ["isnull"],
|
||||
"owner__id": ID_KWARGS,
|
||||
"custom_fields": ["icontains"],
|
||||
}
|
||||
|
||||
|
||||
class ShareLinkFilterSet(FilterSet):
    class Meta:
        model = ShareLink
        fields = {
            "created": DATETIME_KWARGS,
            "expiration": DATETIME_KWARGS,
        }


class PaperlessTaskFilterSet(FilterSet):
    acknowledged = BooleanFilter(
        label="Acknowledged",
        field_name="acknowledged",
    )

    class Meta:
        model = PaperlessTask
        fields = {
            "type": ["exact"],
            "task_name": ["exact"],
            "status": ["exact"],
        }


class ObjectOwnedOrGrantedPermissionsFilter(ObjectPermissionsFilter):
    """
    A filter backend that limits results to those where the requesting user
    has read object level permissions, owns the objects, or objects without
    an owner (for backwards compat)
    """

    def filter_queryset(self, request, queryset, view):
        objects_with_perms = super().filter_queryset(request, queryset, view)
        objects_owned = queryset.filter(owner=request.user)
        objects_unowned = queryset.filter(owner__isnull=True)
        return objects_with_perms | objects_owned | objects_unowned


class ObjectOwnedPermissionsFilter(ObjectPermissionsFilter):
    """
    A filter backend that limits results to those where the requesting user
    owns the objects or objects without an owner (for backwards compat)
    """

    def filter_queryset(self, request, queryset, view):
        if request.user.is_superuser:
            return queryset
        objects_owned = queryset.filter(owner=request.user)
        objects_unowned = queryset.filter(owner__isnull=True)
        return objects_owned | objects_unowned


class DocumentsOrderingFilter(OrderingFilter):
    field_name = "ordering"
    prefix = "custom_field_"

    def filter_queryset(self, request, queryset, view):
        param = request.query_params.get("ordering")
        if param and self.prefix in param:
            custom_field_id = int(param.split(self.prefix)[1])
            try:
                field = CustomField.objects.get(pk=custom_field_id)
            except CustomField.DoesNotExist:
                raise serializers.ValidationError(
                    {self.prefix + str(custom_field_id): [_("Custom field not found")]},
                )

            annotation = None
            match field.data_type:
                case (
                    CustomField.FieldDataType.STRING
                    | CustomField.FieldDataType.LONG_TEXT
                ):
                    annotation = Subquery(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ).values("value_text")[:1],
                    )
                case CustomField.FieldDataType.INT:
                    annotation = Subquery(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ).values("value_int")[:1],
                    )
                case CustomField.FieldDataType.FLOAT:
                    annotation = Subquery(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ).values("value_float")[:1],
                    )
                case CustomField.FieldDataType.DATE:
                    annotation = Subquery(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ).values("value_date")[:1],
                    )
                case CustomField.FieldDataType.MONETARY:
                    annotation = Subquery(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ).values("value_monetary_amount")[:1],
                    )
                case CustomField.FieldDataType.SELECT:
                    # Select options are a little more complicated since the value is the id of the option, not
                    # the label. Additionally, to support sqlite we can't use StringAgg, so we need to create a
                    # case statement for each option, setting the value to the index of the option in a list
                    # sorted by label, and then summing the results to give a single value for the annotation

                    select_options = sorted(
                        field.extra_data.get("select_options", []),
                        key=lambda x: x.get("label"),
                    )
                    whens = [
                        When(
                            custom_fields__field_id=custom_field_id,
                            custom_fields__value_select=option.get("id"),
                            then=Value(idx, output_field=IntegerField()),
                        )
                        for idx, option in enumerate(select_options)
                    ]
                    whens.append(
                        When(
                            custom_fields__field_id=custom_field_id,
                            custom_fields__value_select__isnull=True,
                            then=Value(
                                len(select_options),
                                output_field=IntegerField(),
                            ),
                        ),
                    )
                    annotation = Sum(
                        Case(
                            *whens,
                            default=Value(0),
                            output_field=IntegerField(),
                        ),
                    )
                case CustomField.FieldDataType.DOCUMENTLINK:
                    annotation = Subquery(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ).values("value_document_ids")[:1],
                    )
                case CustomField.FieldDataType.URL:
                    annotation = Subquery(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ).values("value_url")[:1],
                    )
                case CustomField.FieldDataType.BOOL:
                    annotation = Subquery(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ).values("value_bool")[:1],
                    )

            if not annotation:
                # Only happens if a new data type is added and not handled here
                raise ValueError("Invalid custom field data type")

            queryset = (
                queryset.annotate(
                    # We need to annotate the queryset with the custom field value
                    custom_field_value=annotation,
                    # We also need to annotate the queryset with a boolean for sorting whether the field exists
                    has_field=Exists(
                        CustomFieldInstance.objects.filter(
                            document_id=OuterRef("id"),
                            field_id=custom_field_id,
                        ),
                    ),
                )
                .order_by(
                    "-has_field",
                    param.replace(
                        self.prefix + str(custom_field_id),
                        "custom_field_value",
                    ),
                )
                .distinct()
            )

        return super().filter_queryset(request, queryset, view)
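
The custom-field ordering above reduces to a correlated Subquery annotation. A minimal, hedged sketch of that pattern, runnable in a Django shell against these models (the field id 3 is a hypothetical example):

from django.db.models import OuterRef, Subquery

from documents.models import CustomFieldInstance, Document

# Annotate each document with the text value of a (hypothetical) custom field 3,
# then sort by that value; documents without the field sort together.
value_for_doc = Subquery(
    CustomFieldInstance.objects.filter(
        document_id=OuterRef("id"),
        field_id=3,  # hypothetical custom field id
    ).values("value_text")[:1],
)
ordered = Document.objects.annotate(custom_field_value=value_for_doc).order_by(
    "custom_field_value",
)
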
492
src/documents/index.py
Normal file
@@ -0,0 +1,492 @@
from __future__ import annotations

import logging
import math
import re
from collections import Counter
from contextlib import contextmanager
from datetime import datetime
from datetime import time
from datetime import timedelta
from datetime import timezone
from shutil import rmtree
from typing import TYPE_CHECKING
from typing import Literal

from django.conf import settings
from django.utils import timezone as django_timezone
from django.utils.timezone import get_current_timezone
from django.utils.timezone import now
from guardian.shortcuts import get_users_with_perms
from whoosh import classify
from whoosh import highlight
from whoosh import query
from whoosh.fields import BOOLEAN
from whoosh.fields import DATETIME
from whoosh.fields import KEYWORD
from whoosh.fields import NUMERIC
from whoosh.fields import TEXT
from whoosh.fields import Schema
from whoosh.highlight import HtmlFormatter
from whoosh.idsets import BitSet
from whoosh.idsets import DocIdSet
from whoosh.index import FileIndex
from whoosh.index import create_in
from whoosh.index import exists_in
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.qparser.dateparse import English
from whoosh.qparser.plugins import FieldsPlugin
from whoosh.scoring import TF_IDF
from whoosh.util.times import timespan
from whoosh.writing import AsyncWriter

from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import Note
from documents.models import User

if TYPE_CHECKING:
    from django.db.models import QuerySet
    from whoosh.reading import IndexReader
    from whoosh.searching import ResultsPage
    from whoosh.searching import Searcher

logger = logging.getLogger("paperless.index")


def get_schema() -> Schema:
    return Schema(
        id=NUMERIC(stored=True, unique=True),
        title=TEXT(sortable=True),
        content=TEXT(),
        asn=NUMERIC(sortable=True, signed=False),
        correspondent=TEXT(sortable=True),
        correspondent_id=NUMERIC(),
        has_correspondent=BOOLEAN(),
        tag=KEYWORD(commas=True, scorable=True, lowercase=True),
        tag_id=KEYWORD(commas=True, scorable=True),
        has_tag=BOOLEAN(),
        type=TEXT(sortable=True),
        type_id=NUMERIC(),
        has_type=BOOLEAN(),
        created=DATETIME(sortable=True),
        modified=DATETIME(sortable=True),
        added=DATETIME(sortable=True),
        path=TEXT(sortable=True),
        path_id=NUMERIC(),
        has_path=BOOLEAN(),
        notes=TEXT(),
        num_notes=NUMERIC(sortable=True, signed=False),
        custom_fields=TEXT(),
        custom_field_count=NUMERIC(sortable=True, signed=False),
        has_custom_fields=BOOLEAN(),
        custom_fields_id=KEYWORD(commas=True),
        owner=TEXT(),
        owner_id=NUMERIC(),
        has_owner=BOOLEAN(),
        viewer_id=KEYWORD(commas=True),
        checksum=TEXT(),
        page_count=NUMERIC(sortable=True),
        original_filename=TEXT(sortable=True),
        is_shared=BOOLEAN(),
    )


def open_index(*, recreate=False) -> FileIndex:
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception:
        logger.exception("Error while opening the index, recreating.")

    # create_in doesn't handle corrupted indexes very well, remove the directory entirely first
    if settings.INDEX_DIR.is_dir():
        rmtree(settings.INDEX_DIR)
    settings.INDEX_DIR.mkdir(parents=True, exist_ok=True)

    return create_in(settings.INDEX_DIR, get_schema())


@contextmanager
def open_index_writer(*, optimize=False) -> AsyncWriter:
    writer = AsyncWriter(open_index())

    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        writer.cancel()
    finally:
        writer.commit(optimize=optimize)

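
A hedged usage sketch for the writer context manager above, assuming a Document instance doc already exists:

from documents import index

# The context manager commits on exit (see open_index_writer above), so no
# explicit commit is needed here.
with index.open_index_writer() as writer:
    index.update_document(writer, doc)

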
@contextmanager
def open_index_searcher() -> Searcher:
    searcher = open_index().searcher()

    try:
        yield searcher
    finally:
        searcher.close()


def update_document(writer: AsyncWriter, doc: Document) -> None:
    tags = ",".join([t.name for t in doc.tags.all()])
    tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
    notes = ",".join([str(c.note) for c in Note.objects.filter(document=doc)])
    custom_fields = ",".join(
        [str(c) for c in CustomFieldInstance.objects.filter(document=doc)],
    )
    custom_fields_ids = ",".join(
        [str(f.field.id) for f in CustomFieldInstance.objects.filter(document=doc)],
    )
    asn: int | None = doc.archive_serial_number
    if asn is not None and (
        asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
        or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
    ):
        logger.error(
            f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
            f"ASN is out of range "
            f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
            f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}].",
        )
        asn = 0
    users_with_perms = get_users_with_perms(
        doc,
        only_with_perms_in=["view_document"],
    )
    viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=doc.correspondent.name if doc.correspondent else None,
        correspondent_id=doc.correspondent.id if doc.correspondent else None,
        has_correspondent=doc.correspondent is not None,
        tag=tags if tags else None,
        tag_id=tags_ids if tags_ids else None,
        has_tag=len(tags) > 0,
        type=doc.document_type.name if doc.document_type else None,
        type_id=doc.document_type.id if doc.document_type else None,
        has_type=doc.document_type is not None,
        created=datetime.combine(doc.created, time.min),
        added=doc.added,
        asn=asn,
        modified=doc.modified,
        path=doc.storage_path.name if doc.storage_path else None,
        path_id=doc.storage_path.id if doc.storage_path else None,
        has_path=doc.storage_path is not None,
        notes=notes,
        num_notes=len(notes),
        custom_fields=custom_fields,
        custom_field_count=len(doc.custom_fields.all()),
        has_custom_fields=len(custom_fields) > 0,
        custom_fields_id=custom_fields_ids if custom_fields_ids else None,
        owner=doc.owner.username if doc.owner else None,
        owner_id=doc.owner.id if doc.owner else None,
        has_owner=doc.owner is not None,
        viewer_id=viewer_ids if viewer_ids else None,
        checksum=doc.checksum,
        page_count=doc.page_count,
        original_filename=doc.original_filename,
        is_shared=len(viewer_ids) > 0,
    )
    logger.debug(f"Index updated for document {doc.pk}.")


def remove_document(writer: AsyncWriter, doc: Document) -> None:
    remove_document_by_id(writer, doc.pk)


def remove_document_by_id(writer: AsyncWriter, doc_id) -> None:
    writer.delete_by_term("id", doc_id)


def add_or_update_document(document: Document) -> None:
    with open_index_writer() as writer:
        update_document(writer, document)


def remove_document_from_index(document: Document) -> None:
    with open_index_writer() as writer:
        remove_document(writer, document)


class MappedDocIdSet(DocIdSet):
    """
    A DocIdSet backed by a set of `Document` IDs.
    Supports efficiently looking up if a whoosh docnum is in the provided `filter_queryset`.
    """

    def __init__(self, filter_queryset: QuerySet, ixreader: IndexReader) -> None:
        super().__init__()
        document_ids = filter_queryset.order_by("id").values_list("id", flat=True)
        max_id = document_ids.last() or 0
        self.document_ids = BitSet(document_ids, size=max_id)
        self.ixreader = ixreader

    def __contains__(self, docnum) -> bool:
        document_id = self.ixreader.stored_fields(docnum)["id"]
        return document_id in self.document_ids

    def __bool__(self) -> Literal[True]:
        # searcher.search ignores a filter if it's "falsy".
        # We use this hack so this DocIdSet, when used as a filter, is never ignored.
        return True


class DelayedQuery:
    def _get_query(self):
        raise NotImplementedError  # pragma: no cover

    def _get_query_sortedby(self) -> tuple[None, Literal[False]] | tuple[str, bool]:
        if "ordering" not in self.query_params:
            return None, False

        field: str = self.query_params["ordering"]

        sort_fields_map: dict[str, str] = {
            "created": "created",
            "modified": "modified",
            "added": "added",
            "title": "title",
            "correspondent__name": "correspondent",
            "document_type__name": "type",
            "archive_serial_number": "asn",
            "num_notes": "num_notes",
            "owner": "owner",
            "page_count": "page_count",
        }

        if field.startswith("-"):
            field = field[1:]
            reverse = True
        else:
            reverse = False

        if field not in sort_fields_map:
            return None, False
        else:
            return sort_fields_map[field], reverse

    def __init__(
        self,
        searcher: Searcher,
        query_params,
        page_size,
        filter_queryset: QuerySet,
    ) -> None:
        self.searcher = searcher
        self.query_params = query_params
        self.page_size = page_size
        self.saved_results = dict()
        self.first_score = None
        self.filter_queryset = filter_queryset
        self.suggested_correction = None

    def __len__(self) -> int:
        page = self[0:1]
        return len(page)

    def __getitem__(self, item):
        if item.start in self.saved_results:
            return self.saved_results[item.start]

        q, mask, suggested_correction = self._get_query()
        self.suggested_correction = suggested_correction
        sortedby, reverse = self._get_query_sortedby()

        page: ResultsPage = self.searcher.search_page(
            q,
            mask=mask,
            filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
            pagenum=math.floor(item.start / self.page_size) + 1,
            pagelen=self.page_size,
            sortedby=sortedby,
            reverse=reverse,
        )
        page.results.fragmenter = highlight.ContextFragmenter(surround=50)
        page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")

        if not self.first_score and len(page.results) > 0 and sortedby is None:
            self.first_score = page.results[0].score

        page.results.top_n = list(
            map(
                lambda hit: (
                    (hit[0] / self.first_score) if self.first_score else None,
                    hit[1],
                ),
                page.results.top_n,
            ),
        )

        self.saved_results[item.start] = page

        return page


class LocalDateParser(English):
    def reverse_timezone_offset(self, d):
        return (d.replace(tzinfo=django_timezone.get_current_timezone())).astimezone(
            timezone.utc,
        )

    def date_from(self, *args, **kwargs):
        d = super().date_from(*args, **kwargs)
        if isinstance(d, timespan):
            d.start = self.reverse_timezone_offset(d.start)
            d.end = self.reverse_timezone_offset(d.end)
        elif isinstance(d, datetime):
            d = self.reverse_timezone_offset(d)
        return d


class DelayedFullTextQuery(DelayedQuery):
    def _get_query(self) -> tuple:
        q_str = self.query_params["query"]
        q_str = rewrite_natural_date_keywords(q_str)
        qp = MultifieldParser(
            [
                "content",
                "title",
                "correspondent",
                "tag",
                "type",
                "notes",
                "custom_fields",
            ],
            self.searcher.ixreader.schema,
        )
        qp.add_plugin(
            DateParserPlugin(
                basedate=django_timezone.now(),
                dateparser=LocalDateParser(),
            ),
        )
        q = qp.parse(q_str)
        suggested_correction = None
        try:
            corrected = self.searcher.correct_query(q, q_str)
            if corrected.string != q_str:
                suggested_correction = corrected.string
        except Exception as e:
            logger.info(
                "Error while correcting query %s: %s",
                f"{q_str!r}",
                e,
            )

        return q, None, suggested_correction


class DelayedMoreLikeThisQuery(DelayedQuery):
    def _get_query(self) -> tuple:
        more_like_doc_id = int(self.query_params["more_like_id"])
        content = Document.objects.get(id=more_like_doc_id).content

        docnum = self.searcher.document_number(id=more_like_doc_id)
        kts = self.searcher.key_terms_from_text(
            "content",
            content,
            numterms=20,
            model=classify.Bo1Model,
            normalize=False,
        )
        q = query.Or(
            [query.Term("content", word, boost=weight) for word, weight in kts],
        )
        mask: set = {docnum}

        return q, mask, None


def autocomplete(
    ix: FileIndex,
    term: str,
    limit: int = 10,
    user: User | None = None,
) -> list:
    """
    Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
    and without scoring
    """
    terms = []

    with ix.searcher(weighting=TF_IDF()) as s:
        qp = QueryParser("content", schema=ix.schema)
        # Don't let a search term that happens to look like a field query be
        # parsed as one; it should search the content field instead of
        # returning bogus, non-text data
        qp.remove_plugin_class(FieldsPlugin)
        q = qp.parse(f"{term.lower()}*")
        user_criterias: list = get_permissions_criterias(user)

        results = s.search(
            q,
            terms=True,
            filter=query.Or(user_criterias) if user_criterias is not None else None,
        )

        termCounts = Counter()
        if results.has_matched_terms():
            for hit in results:
                for _, match in hit.matched_terms():
                    termCounts[match] += 1
            terms = [t for t, _ in termCounts.most_common(limit)]

        term_encoded: bytes = term.encode("UTF-8")
        if term_encoded in terms:
            terms.insert(0, terms.pop(terms.index(term_encoded)))

    return terms


def get_permissions_criterias(user: User | None = None) -> list:
    user_criterias = [query.Term("has_owner", text=False)]
    if user is not None:
        if user.is_superuser:  # superusers see all docs
            user_criterias = []
        else:
            user_criterias.append(query.Term("owner_id", user.id))
            user_criterias.append(
                query.Term("viewer_id", str(user.id)),
            )
    return user_criterias

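
To make the permission filter above concrete, a hedged sketch of the query it produces for a non-superuser (user id 42 is a hypothetical example): such a user matches unowned documents, documents they own, and documents shared with them.

from whoosh import query

permission_filter = query.Or(
    [
        query.Term("has_owner", text=False),  # documents without an owner
        query.Term("owner_id", 42),  # documents owned by the user
        query.Term("viewer_id", "42"),  # documents shared with the user
    ],
)

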
def rewrite_natural_date_keywords(query_string: str) -> str:
    """
    Rewrites natural date keywords (e.g. added:today or added:"yesterday") to UTC range syntax for Whoosh.
    """

    tz = get_current_timezone()
    local_now = now().astimezone(tz)

    today = local_now.date()
    yesterday = today - timedelta(days=1)

    ranges = {
        "today": (
            datetime.combine(today, time.min, tzinfo=tz),
            datetime.combine(today, time.max, tzinfo=tz),
        ),
        "yesterday": (
            datetime.combine(yesterday, time.min, tzinfo=tz),
            datetime.combine(yesterday, time.max, tzinfo=tz),
        ),
    }

    pattern = r"(\b(?:added|created))\s*:\s*[\"']?(today|yesterday)[\"']?"

    def repl(m):
        field, keyword = m.group(1), m.group(2)
        start, end = ranges[keyword]
        start_str = start.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")
        end_str = end.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")
        return f"{field}:[{start_str} TO {end_str}]"

    return re.sub(pattern, repl, query_string)
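
A hedged example of the rewrite above (the exact bounds depend on the current date and the configured timezone):

from documents.index import rewrite_natural_date_keywords

rewritten = rewrite_natural_date_keywords('invoice added:"today"')
# e.g. 'invoice added:[20250101230000 TO 20250102225959]'
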
15
src/documents/loggers.py
Normal file
@@ -0,0 +1,15 @@
import logging
import uuid


class LoggingMixin:
    def renew_logging_group(self):
        """
        Creates a new UUID to group subsequent log calls together with
        the extra data named group
        """
        self.logging_group = uuid.uuid4()
        self.log = logging.LoggerAdapter(
            logging.getLogger(self.logging_name),
            extra={"group": self.logging_group},
        )
38
src/documents/mail.py
Normal file
@@ -0,0 +1,38 @@
from email import message_from_bytes
from pathlib import Path

from django.conf import settings
from django.core.mail import EmailMessage
from filelock import FileLock


def send_email(
    subject: str,
    body: str,
    to: list[str],
    attachment: Path | None = None,
    attachment_mime_type: str | None = None,
) -> int:
    """
    Send an email with an optional attachment.
    TODO: re-evaluate this pending https://code.djangoproject.com/ticket/35581 / https://github.com/django/django/pull/18966
    """
    email = EmailMessage(
        subject=subject,
        body=body,
        to=to,
    )
    if attachment:
        # Something could be renaming the file concurrently, so hold the media
        # lock while reading it into memory
        with FileLock(settings.MEDIA_LOCK), attachment.open("rb") as f:
            content = f.read()
            if attachment_mime_type == "message/rfc822":
                # See https://forum.djangoproject.com/t/using-emailmessage-with-an-attached-email-file-crashes-due-to-non-ascii/37981
                content = message_from_bytes(content)

        email.attach(
            filename=attachment.name,
            content=content,
            mimetype=attachment_mime_type,
        )
    return email.send()
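
A hedged usage sketch for send_email (the address and file path are hypothetical):

from pathlib import Path

from documents.mail import send_email

# Returns the number of messages sent, per Django's EmailMessage.send()
sent = send_email(
    subject="Your document",
    body="See the attached PDF.",
    to=["user@example.com"],
    attachment=Path("/tmp/example.pdf"),  # hypothetical file
    attachment_mime_type="application/pdf",
)
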
0
src/documents/management/__init__.py
Normal file
0
src/documents/management/commands/__init__.py
Normal file
36
src/documents/management/commands/convert_mariadb_uuid.py
Normal file
@@ -0,0 +1,36 @@
from django.core.management.base import BaseCommand
from django.db import connection
from django.db import models

from documents.models import Document


class Command(BaseCommand):
    # This code is taken almost entirely from https://github.com/wagtail/wagtail/pull/11912 with all credit to the original author.
    help = "Converts UUID columns from char type to the native UUID type used in MariaDB 10.7+ and Django 5.0+."

    def convert_field(self, model, field_name, *, null=False):
        if model._meta.get_field(field_name).model != model:  # pragma: no cover
            # Field is inherited from a parent model
            return

        if not model._meta.managed:  # pragma: no cover
            # The migration framework skips unmanaged models, so we should too
            return

        old_field = models.CharField(null=null, max_length=36)
        old_field.set_attributes_from_name(field_name)

        new_field = models.UUIDField(null=null)
        new_field.set_attributes_from_name(field_name)

        with connection.schema_editor() as schema_editor:
            schema_editor.alter_field(model, old_field, new_field)
        self.stdout.write(
            self.style.SUCCESS(
                f"Successfully converted {model._meta.label} {field_name} field to UUID type.",
            ),
        )

    def handle(self, **options):
        self.convert_field(Document, "transaction_id", null=True)
93
src/documents/management/commands/decrypt_documents.py
Normal file
@@ -0,0 +1,93 @@
from pathlib import Path

from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError

from documents.models import Document
from paperless.db import GnuPG


class Command(BaseCommand):
    help = (
        "This is how you migrate your stored documents from an encrypted "
        "state to an unencrypted one"
    )

    def add_arguments(self, parser) -> None:
        parser.add_argument(
            "--passphrase",
            help=(
                "If PAPERLESS_PASSPHRASE isn't set already, you need to specify it here"
            ),
        )

    def handle(self, *args, **options) -> None:
        try:
            self.stdout.write(
                self.style.WARNING(
                    "\n\n"
                    "WARNING: This script is going to work directly on your "
                    "document originals, so\n"
                    "WARNING: you probably shouldn't run "
                    "this unless you've got a recent backup\n"
                    "WARNING: handy. It "
                    "*should* work without a hitch, but be safe and backup your\n"
                    "WARNING: stuff first.\n\n"
                    "Hit Ctrl+C to exit now, or Enter to "
                    "continue.\n\n",
                ),
            )
            _ = input()
        except KeyboardInterrupt:
            return

        passphrase = options["passphrase"] or settings.PASSPHRASE
        if not passphrase:
            raise CommandError(
                "Passphrase not defined. Please set it with --passphrase or "
                "by declaring it in your environment or your config.",
            )

        self.__gpg_to_unencrypted(passphrase)

    def __gpg_to_unencrypted(self, passphrase: str) -> None:
        encrypted_files = Document.objects.filter(
            storage_type=Document.STORAGE_TYPE_GPG,
        )

        for document in encrypted_files:
            self.stdout.write(f"Decrypting {document}")

            old_paths = [document.source_path, document.thumbnail_path]

            with document.source_file as file_handle:
                raw_document = GnuPG.decrypted(file_handle, passphrase)
            with document.thumbnail_file as file_handle:
                raw_thumb = GnuPG.decrypted(file_handle, passphrase)

            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

            ext: str = Path(document.filename).suffix

            if ext != ".gpg":
                raise CommandError(
                    f"Abort: encrypted file {document.source_path} does not "
                    f"end with .gpg",
                )

            document.filename = Path(document.filename).stem

            with document.source_path.open("wb") as f:
                f.write(raw_document)

            with document.thumbnail_path.open("wb") as f:
                f.write(raw_thumb)

            Document.objects.filter(id=document.id).update(
                storage_type=document.storage_type,
                filename=document.filename,
            )

            for path in old_paths:
                path.unlink()
94
src/documents/management/commands/document_archiver.py
Normal file
@@ -0,0 +1,94 @@
import logging
import multiprocessing

import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand

from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.tasks import update_document_content_maybe_archive_file

logger = logging.getLogger("paperless.management.archiver")


class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
    help = (
        "Recreates the archived versions of documents (and re-parses their "
        "content) using the current parser configuration, optionally for a "
        "single document or overwriting existing archive versions."
    )

    def add_arguments(self, parser):
        parser.add_argument(
            "-f",
            "--overwrite",
            default=False,
            action="store_true",
            help=(
                "Recreates the archived document for documents that already "
                "have an archived version."
            ),
        )
        parser.add_argument(
            "-d",
            "--document",
            default=None,
            type=int,
            required=False,
            help=(
                "Specify the ID of a document, and this command will only "
                "run on this specific document."
            ),
        )
        self.add_argument_progress_bar_mixin(parser)
        self.add_argument_processes_mixin(parser)

    def handle(self, *args, **options):
        self.handle_processes_mixin(**options)
        self.handle_progress_bar_mixin(**options)

        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)

        overwrite = options["overwrite"]

        if options["document"]:
            documents = Document.objects.filter(pk=options["document"])
        else:
            documents = Document.objects.all()

        document_ids = [
            doc.id for doc in documents if overwrite or not doc.has_archive_version
        ]

        # Note to future self: this prevents django from reusing database
        # connections between processes, which is bad and does not work
        # with postgres.
        db.connections.close_all()

        try:
            logging.getLogger().handlers[0].level = logging.ERROR

            if self.process_count == 1:
                for doc_id in document_ids:
                    update_document_content_maybe_archive_file(doc_id)
            else:  # pragma: no cover
                with multiprocessing.Pool(self.process_count) as pool:
                    list(
                        tqdm.tqdm(
                            pool.imap_unordered(
                                update_document_content_maybe_archive_file,
                                document_ids,
                            ),
                            total=len(document_ids),
                            disable=self.no_progress_bar,
                        ),
                    )
        except KeyboardInterrupt:
            self.stdout.write(self.style.NOTICE("Aborting..."))
365
src/documents/management/commands/document_consumer.py
Normal file
@@ -0,0 +1,365 @@
import logging
import os
from concurrent.futures import ThreadPoolExecutor
from fnmatch import filter
from pathlib import Path
from pathlib import PurePath
from threading import Event
from time import monotonic
from time import sleep
from typing import Final

from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from watchdog.events import FileSystemEventHandler
from watchdog.observers.polling import PollingObserver

from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Tag
from documents.parsers import is_file_ext_supported
from documents.tasks import consume_file

try:
    from inotifyrecursive import INotify
    from inotifyrecursive import flags
except ImportError:  # pragma: no cover
    INotify = flags = None

logger = logging.getLogger("paperless.management.consumer")


def _tags_from_path(filepath: Path) -> list[int]:
    """
    Walk up the directory tree from filepath to CONSUMPTION_DIR
    and get or create Tag IDs for every directory.

    Returns a list of Tag IDs.
    """
    db.close_old_connections()
    tag_ids = set()
    path_parts = filepath.relative_to(settings.CONSUMPTION_DIR).parent.parts
    for part in path_parts:
        tag_ids.add(
            Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
        )

    return list(tag_ids)


def _is_ignored(filepath: Path) -> bool:
    """
    Checks if the given file should be ignored, based on configured
    patterns.

    Returns True if the file is ignored, False otherwise
    """
    # Trim out the consume directory, leaving only the filename and its
    # path relative to the consume directory
    filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)

    # March through the components of the path, including directories and the
    # filename, looking for anything matching
    # foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
    parts = []
    for part in filepath_relative.parts:
        # If the part is not the name (ie, it's a dir)
        # Need to append the trailing slash or fnmatch doesn't match
        # fnmatch("dir", "dir/*") == False
        # fnmatch("dir/", "dir/*") == True
        if part != filepath_relative.name:
            part = part + "/"
        parts.append(part)

    for pattern in settings.CONSUMER_IGNORE_PATTERNS:
        if len(filter(parts, pattern)):
            return True

    return False

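To make the trailing-slash trick in _is_ignored concrete, a quick check with plain fnmatch (the same matcher behind fnmatch.filter):

from fnmatch import fnmatch

assert fnmatch("dir", "dir/*") is False  # a bare directory name never matches
assert fnmatch("dir/", "dir/*") is True  # the trailing slash makes it match

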
def _consume(filepath: Path) -> None:
    # Check permissions early
    try:
        filepath.stat()
    except (PermissionError, OSError):
        logger.warning(f"Not consuming file {filepath}: Permission denied.")
        return

    if filepath.is_dir() or _is_ignored(filepath):
        return

    if not filepath.is_file():
        logger.debug(f"Not consuming file {filepath}: File has moved.")
        return

    if not is_file_ext_supported(filepath.suffix):
        logger.warning(f"Not consuming file {filepath}: Unknown file extension.")
        return

    # Total wait time: up to 500ms
    os_error_retry_count: Final[int] = 50
    os_error_retry_wait: Final[float] = 0.01

    read_try_count = 0
    file_open_ok = False
    os_error_str = None

    while (read_try_count < os_error_retry_count) and not file_open_ok:
        try:
            with filepath.open("rb"):
                file_open_ok = True
        except OSError as e:
            read_try_count += 1
            os_error_str = str(e)
            sleep(os_error_retry_wait)

    if read_try_count >= os_error_retry_count:
        logger.warning(f"Not consuming file {filepath}: OS reports {os_error_str}")
        return

    tag_ids = None
    try:
        if settings.CONSUMER_SUBDIRS_AS_TAGS:
            tag_ids = _tags_from_path(filepath)
    except Exception:
        logger.exception("Error creating tags from path")

    try:
        logger.info(f"Adding {filepath} to the task queue.")
        consume_file.delay(
            ConsumableDocument(
                source=DocumentSource.ConsumeFolder,
                original_file=filepath,
            ),
            DocumentMetadataOverrides(tag_ids=tag_ids),
        )
    except Exception:
        # Catch all so that the consumer won't crash.
        # This is also what the test case is listening for to check for
        # errors.
        logger.exception("Error while consuming document")


def _consume_wait_unmodified(file: Path) -> None:
    """
    Waits for the given file to appear unmodified based on file size
    and modification time. Will wait a configured number of seconds
    and retry a configured number of times before either consuming or
    giving up
    """
    if _is_ignored(file):
        return

    logger.debug(f"Waiting for file {file} to remain unmodified")
    mtime = -1
    size = -1
    current_try = 0
    while current_try < settings.CONSUMER_POLLING_RETRY_COUNT:
        try:
            stat_data = file.stat()
            new_mtime = stat_data.st_mtime
            new_size = stat_data.st_size
        except FileNotFoundError:
            logger.debug(
                f"File {file} moved while waiting for it to remain unmodified.",
            )
            return
        if new_mtime == mtime and new_size == size:
            _consume(file)
            return
        mtime = new_mtime
        size = new_size
        sleep(settings.CONSUMER_POLLING_DELAY)
        current_try += 1

    logger.error(f"Timeout while waiting on file {file} to remain unmodified.")


class Handler(FileSystemEventHandler):
    def __init__(self, pool: ThreadPoolExecutor) -> None:
        super().__init__()
        self._pool = pool

    def on_created(self, event):
        self._pool.submit(_consume_wait_unmodified, Path(event.src_path))

    def on_moved(self, event):
        self._pool.submit(_consume_wait_unmodified, Path(event.dest_path))


class Command(BaseCommand):
    """
    On every iteration of an infinite loop, consume what we can from the
    consumption directory.
    """

    # This is here primarily for the tests and is irrelevant in production.
    stop_flag = Event()
    # Also only for testing, configures in one place the timeout used before checking
    # the stop flag
    testing_timeout_s: Final[float] = 0.5
    testing_timeout_ms: Final[float] = testing_timeout_s * 1000.0

    def add_arguments(self, parser):
        parser.add_argument(
            "directory",
            default=settings.CONSUMPTION_DIR,
            nargs="?",
            help="The consumption directory.",
        )
        parser.add_argument("--oneshot", action="store_true", help="Run only once.")

        # Only used during unit testing; configures a timeout.
        # If left unset or false, the consumer will exit when it
        # receives SIGINT
        parser.add_argument(
            "--testing",
            action="store_true",
            help="Flag used only for unit testing",
            default=False,
        )

    def handle(self, *args, **options):
        directory = options["directory"]
        recursive = settings.CONSUMER_RECURSIVE

        if not directory:
            raise CommandError("CONSUMPTION_DIR does not appear to be set.")

        directory = Path(directory).resolve()

        if not directory.is_dir():
            raise CommandError(f"Consumption directory {directory} does not exist")

        # Consumer will need this
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)

        if recursive:
            for dirpath, _, filenames in os.walk(directory):
                for filename in filenames:
                    filepath = Path(dirpath) / filename
                    _consume(filepath)
        else:
            for filepath in directory.iterdir():
                _consume(filepath)

        if options["oneshot"]:
            return

        if settings.CONSUMER_POLLING == 0 and INotify:
            self.handle_inotify(directory, recursive, is_testing=options["testing"])
        else:
            if INotify is None and settings.CONSUMER_POLLING == 0:  # pragma: no cover
                logger.warning("Using polling as INotify import failed")
            self.handle_polling(directory, recursive, is_testing=options["testing"])

        logger.debug("Consumer exiting.")

    def handle_polling(self, directory, recursive, *, is_testing: bool):
        logger.info(f"Polling directory for changes: {directory}")

        timeout = None
        if is_testing:
            timeout = self.testing_timeout_s
            logger.debug(f"Configuring timeout to {timeout}s")

        polling_interval = settings.CONSUMER_POLLING
        if polling_interval == 0:  # pragma: no cover
            # Only happens if INotify failed to import
            logger.warning(
                "Falling back to a 10s polling interval; "
                "consider setting CONSUMER_POLLING explicitly",
            )
            polling_interval = 10

        with ThreadPoolExecutor(max_workers=4) as pool:
            observer = PollingObserver(timeout=polling_interval)
            observer.schedule(Handler(pool), directory, recursive=recursive)
            observer.start()
            try:
                while observer.is_alive():
                    observer.join(timeout)
                    if self.stop_flag.is_set():
                        observer.stop()
            except KeyboardInterrupt:
                observer.stop()
            observer.join()

    def handle_inotify(self, directory, recursive, *, is_testing: bool):
        logger.info(f"Using inotify to watch directory for changes: {directory}")

        timeout_ms = None
        if is_testing:
            timeout_ms = self.testing_timeout_ms
            logger.debug(f"Configuring timeout to {timeout_ms}ms")

        inotify = INotify()
        inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO | flags.MODIFY
        if recursive:
            inotify.add_watch_recursive(directory, inotify_flags)
        else:
            inotify.add_watch(directory, inotify_flags)

        inotify_debounce_secs: Final[float] = settings.CONSUMER_INOTIFY_DELAY
        inotify_debounce_ms: Final[int] = inotify_debounce_secs * 1000

        finished = False

        notified_files = {}

        try:
            while not finished:
                try:
                    for event in inotify.read(timeout=timeout_ms):
                        path = inotify.get_path(event.wd) if recursive else directory
                        filepath = Path(path) / event.name
                        if flags.MODIFY in flags.from_mask(event.mask):
                            notified_files.pop(filepath, None)
                        else:
                            notified_files[filepath] = monotonic()

                    # Check the files against the timeout
                    still_waiting = {}
                    # last_event_time is time of the last inotify event for this file
                    for filepath, last_event_time in notified_files.items():
                        # Current time - last time over the configured timeout
                        waited_long_enough = (
                            monotonic() - last_event_time
                        ) > inotify_debounce_secs

                        # Also make sure the file exists still, some scanners might write a
                        # temporary file first
                        try:
                            file_still_exists = filepath.exists() and filepath.is_file()
                        except (PermissionError, OSError):  # pragma: no cover
                            # If we can't check, let it fail in the _consume function
                            file_still_exists = True
                            continue

                        if waited_long_enough and file_still_exists:
                            _consume(filepath)
                        elif file_still_exists:
                            still_waiting[filepath] = last_event_time

                    # These files are still waiting to hit the timeout
                    notified_files = still_waiting

                    # If files are waiting, need to exit read() to check them
                    # Otherwise, go back to infinite sleep time, but only if not testing
                    if len(notified_files) > 0:
                        timeout_ms = inotify_debounce_ms
                    elif is_testing:
                        timeout_ms = self.testing_timeout_ms
                    else:
                        timeout_ms = None

                    if self.stop_flag.is_set():
                        logger.debug("Finishing because event is set")
                        finished = True

                except KeyboardInterrupt:
                    logger.info("Received SIGINT, stopping inotify")
                    finished = True
        finally:
            inotify.close()
@@ -0,0 +1,13 @@
from django.core.management.base import BaseCommand

from documents.tasks import train_classifier


class Command(BaseCommand):
    help = (
        "Trains the classifier on your data and saves the resulting models to a "
        "file. The document consumer will then automatically use this new model."
    )

    def handle(self, *args, **options):
        train_classifier(scheduled=False)
613
src/documents/management/commands/document_exporter.py
Normal file
@@ -0,0 +1,613 @@
import hashlib
import json
import os
import shutil
import tempfile
import time
from pathlib import Path
from typing import TYPE_CHECKING

import tqdm
from allauth.mfa.models import Authenticator
from allauth.socialaccount.models import SocialAccount
from allauth.socialaccount.models import SocialApp
from allauth.socialaccount.models import SocialToken
from django.conf import settings
from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core import serializers
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.db import transaction
from django.utils import timezone
from filelock import FileLock
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission

if TYPE_CHECKING:
    from django.db.models import QuerySet

if settings.AUDIT_LOG_ENABLED:
    from auditlog.models import LogEntry

from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_filename
from documents.management.commands.mixins import CryptMixin
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import SavedView
from documents.models import SavedViewFilterRule
from documents.models import StoragePath
from documents.models import Tag
from documents.models import UiSettings
from documents.models import Workflow
from documents.models import WorkflowAction
from documents.models import WorkflowActionEmail
from documents.models import WorkflowActionWebhook
from documents.models import WorkflowTrigger
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.db import GnuPG
from paperless.models import ApplicationConfiguration
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule


class Command(CryptMixin, BaseCommand):
    help = (
        "Decrypt and rename all files in our collection into a given target "
        "directory, and include a manifest file containing document data for "
        "easy import."
    )

    def add_arguments(self, parser):
        parser.add_argument("target")

        parser.add_argument(
            "-c",
            "--compare-checksums",
            default=False,
            action="store_true",
            help=(
                "Compare file checksums when determining whether to export "
                "a file or not. If not specified, file size and time "
                "modified is used instead."
            ),
        )

        parser.add_argument(
            "-cj",
            "--compare-json",
            default=False,
            action="store_true",
            help=(
                "Compare json file checksums when determining whether to "
                "export a json file or not (manifest or metadata). "
                "If not specified, the file is always exported."
            ),
        )

        parser.add_argument(
            "-d",
            "--delete",
            default=False,
            action="store_true",
            help=(
                "After exporting, delete files in the export directory that "
                "do not belong to the current export, such as files from "
                "deleted documents."
            ),
        )

        parser.add_argument(
            "-f",
            "--use-filename-format",
            default=False,
            action="store_true",
            help=(
                "Use PAPERLESS_FILENAME_FORMAT for storing files in the "
                "export directory, if configured."
            ),
        )

        parser.add_argument(
            "-na",
            "--no-archive",
            default=False,
            action="store_true",
            help="Avoid exporting archive files",
        )

        parser.add_argument(
            "-nt",
            "--no-thumbnail",
            default=False,
            action="store_true",
            help="Avoid exporting thumbnail files",
        )

        parser.add_argument(
            "-p",
            "--use-folder-prefix",
            default=False,
            action="store_true",
            help=(
                "Export files in dedicated folders according to their nature: "
                "archive, originals or thumbnails"
            ),
        )

        parser.add_argument(
            "-sm",
            "--split-manifest",
            default=False,
            action="store_true",
            help="Export document information in individual manifest json files.",
        )

        parser.add_argument(
            "-z",
            "--zip",
            default=False,
            action="store_true",
            help="Export the documents to a zip file in the given directory",
        )

        parser.add_argument(
            "-zn",
            "--zip-name",
            default=f"export-{timezone.localdate().isoformat()}",
            help="Sets the export zip file name",
        )

        parser.add_argument(
            "--data-only",
            default=False,
            action="store_true",
            help="If set, only the database will be exported, not files",
        )

        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )

        parser.add_argument(
            "--passphrase",
            help="If provided, is used to encrypt sensitive data in the export",
        )
    def handle(self, *args, **options):
        self.target = Path(options["target"]).resolve()
        self.split_manifest: bool = options["split_manifest"]
        self.compare_checksums: bool = options["compare_checksums"]
        self.compare_json: bool = options["compare_json"]
        self.use_filename_format: bool = options["use_filename_format"]
        self.use_folder_prefix: bool = options["use_folder_prefix"]
        self.delete: bool = options["delete"]
        self.no_archive: bool = options["no_archive"]
        self.no_thumbnail: bool = options["no_thumbnail"]
        self.zip_export: bool = options["zip"]
        self.data_only: bool = options["data_only"]
        self.no_progress_bar: bool = options["no_progress_bar"]
        self.passphrase: str | None = options.get("passphrase")

        self.files_in_export_dir: set[Path] = set()
        self.exported_files: set[str] = set()

        # If zipping, save the original target for later and
        # get a temporary directory for the target instead
        temp_dir = None
        self.original_target = self.target
        if self.zip_export:
            settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
            temp_dir = tempfile.TemporaryDirectory(
                dir=settings.SCRATCH_DIR,
                prefix="paperless-export",
            )
            self.target = Path(temp_dir.name).resolve()

        if not self.target.exists():
            raise CommandError("That path doesn't exist")

        if not self.target.is_dir():
            raise CommandError("That path isn't a directory")

        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        try:
            # Prevent any ongoing changes in the documents
            with FileLock(settings.MEDIA_LOCK):
                self.dump()

            # We've written everything to the temporary directory in this case,
            # now make an archive in the original target, with all files stored
            if self.zip_export and temp_dir is not None:
                shutil.make_archive(
                    self.original_target / options["zip_name"],
                    format="zip",
                    root_dir=temp_dir.name,
                )

        finally:
            # Always cleanup the temporary directory, if one was created
            if self.zip_export and temp_dir is not None:
                temp_dir.cleanup()

    def dump(self):
        # 1. Take a snapshot of what files exist in the current export folder
        for x in self.target.glob("**/*"):
            if x.is_file():
                self.files_in_export_dir.add(x.resolve())

        # 2. Create manifest, containing all correspondents, types, tags, storage paths
        # note, documents and ui_settings
        manifest_key_to_object_query: dict[str, QuerySet] = {
            "correspondents": Correspondent.objects.all(),
            "tags": Tag.objects.all(),
            "document_types": DocumentType.objects.all(),
            "storage_paths": StoragePath.objects.all(),
            "mail_accounts": MailAccount.objects.all(),
            "mail_rules": MailRule.objects.all(),
            "saved_views": SavedView.objects.all(),
            "saved_view_filter_rules": SavedViewFilterRule.objects.all(),
            "groups": Group.objects.all(),
            "users": User.objects.exclude(
                username__in=["consumer", "AnonymousUser"],
            ).all(),
            "ui_settings": UiSettings.objects.all(),
            "content_types": ContentType.objects.all(),
            "permissions": Permission.objects.all(),
            "user_object_permissions": UserObjectPermission.objects.all(),
            "group_object_permissions": GroupObjectPermission.objects.all(),
            "workflow_triggers": WorkflowTrigger.objects.all(),
            "workflow_actions": WorkflowAction.objects.all(),
            "workflow_email_actions": WorkflowActionEmail.objects.all(),
            "workflow_webhook_actions": WorkflowActionWebhook.objects.all(),
            "workflows": Workflow.objects.all(),
            "custom_fields": CustomField.objects.all(),
            "custom_field_instances": CustomFieldInstance.objects.all(),
            "app_configs": ApplicationConfiguration.objects.all(),
            "notes": Note.objects.all(),
            "documents": Document.objects.order_by("id").all(),
            "social_accounts": SocialAccount.objects.all(),
            "social_apps": SocialApp.objects.all(),
            "social_tokens": SocialToken.objects.all(),
            "authenticators": Authenticator.objects.all(),
        }

        if settings.AUDIT_LOG_ENABLED:
            manifest_key_to_object_query["log_entries"] = LogEntry.objects.all()

        with transaction.atomic():
            manifest_dict = {}

            # Build an overall manifest
            for key, object_query in manifest_key_to_object_query.items():
                manifest_dict[key] = json.loads(
                    serializers.serialize("json", object_query),
                )

            self.encrypt_secret_fields(manifest_dict)

            # These are treated specially and included in the per-document manifest
            # if that setting is enabled. Otherwise, they are just exported to the bulk
            # manifest
            document_map: dict[int, Document] = {
                d.pk: d for d in manifest_key_to_object_query["documents"]
            }
            document_manifest = manifest_dict["documents"]

            # 3. Export files from each document
            for index, document_dict in tqdm.tqdm(
                enumerate(document_manifest),
                total=len(document_manifest),
                disable=self.no_progress_bar,
            ):
                # 3.1. store files unencrypted
                document_dict["fields"]["storage_type"] = (
                    Document.STORAGE_TYPE_UNENCRYPTED
                )

                document = document_map[document_dict["pk"]]

                # 3.2. generate a unique filename
                base_name = self.generate_base_name(document)

                # 3.3. write filenames into manifest
                original_target, thumbnail_target, archive_target = (
                    self.generate_document_targets(document, base_name, document_dict)
                )

                # 3.4. write files to target folder
                if not self.data_only:
                    self.copy_document_files(
                        document,
                        original_target,
                        thumbnail_target,
                        archive_target,
                    )

                if self.split_manifest:
                    manifest_name = base_name.with_name(
                        f"{base_name.stem}-manifest.json",
                    )
                    if self.use_folder_prefix:
                        manifest_name = Path("json") / manifest_name
                    manifest_name = (self.target / manifest_name).resolve()
                    manifest_name.parent.mkdir(parents=True, exist_ok=True)
                    content = [document_manifest[index]]
                    content += list(
                        filter(
                            lambda d: d["fields"]["document"] == document_dict["pk"],
                            manifest_dict["notes"],
                        ),
                    )
                    content += list(
                        filter(
                            lambda d: d["fields"]["document"] == document_dict["pk"],
                            manifest_dict["custom_field_instances"],
                        ),
                    )

                    self.check_and_write_json(
                        content,
                        manifest_name,
                    )

            # These were exported already
            if self.split_manifest:
                del manifest_dict["documents"]
                del manifest_dict["notes"]
                del manifest_dict["custom_field_instances"]

            # 4.1 write primary manifest to target folder
            manifest = []
            for items in manifest_dict.values():
                manifest.extend(items)
            manifest_path = (self.target / "manifest.json").resolve()
            self.check_and_write_json(
                manifest,
                manifest_path,
            )

            # 4.2 write version information to target folder
            extra_metadata_path = (self.target / "metadata.json").resolve()
            metadata: dict[str, str | int | dict[str, str | int]] = {
                "version": version.__full_version_str__,
            }

            # 4.2.1 If needed, write the crypto values into the metadata
            # Django stores most of these in the field itself, we store them once here
            if self.passphrase:
                metadata.update(self.get_crypt_params())

            self.check_and_write_json(
                metadata,
                extra_metadata_path,
            )

        if self.delete:
            # 5. Remove files which we did not explicitly export in this run
            if not self.zip_export:
                for f in self.files_in_export_dir:
                    f.unlink()

                    delete_empty_directories(
                        f.parent,
                        self.target,
                    )
            else:
                # 5. Remove anything in the original location (before moving the zip)
                for item in self.original_target.glob("*"):
                    if item.is_dir():
                        shutil.rmtree(item)
                    else:
                        item.unlink()

    def generate_base_name(self, document: Document) -> Path:
        """
        Generates a unique name for the document, one which hasn't already been exported (or will be)
        """
        filename_counter = 0
        while True:
            if self.use_filename_format:
                base_name = generate_filename(
                    document,
                    counter=filename_counter,
                    append_gpg=False,
                )
            else:
                base_name = document.get_public_filename(counter=filename_counter)

            if base_name not in self.exported_files:
                self.exported_files.add(base_name)
                break
            else:
                filename_counter += 1
        return Path(base_name)

    def generate_document_targets(
        self,
        document: Document,
        base_name: Path,
        document_dict: dict,
    ) -> tuple[Path, Path | None, Path | None]:
        """
        Generates the targets for a given document, including the original file, archive file and thumbnail (depending on settings).
        """
original_name = base_name
|
||||
if self.use_folder_prefix:
|
||||
original_name = Path("originals") / original_name
|
||||
original_target = (self.target / original_name).resolve()
|
||||
document_dict[EXPORTER_FILE_NAME] = str(original_name)
|
||||
|
||||
if not self.no_thumbnail:
|
||||
thumbnail_name = base_name.parent / (base_name.stem + "-thumbnail.webp")
|
||||
if self.use_folder_prefix:
|
||||
thumbnail_name = Path("thumbnails") / thumbnail_name
|
||||
thumbnail_target = (self.target / thumbnail_name).resolve()
|
||||
document_dict[EXPORTER_THUMBNAIL_NAME] = str(thumbnail_name)
|
||||
else:
|
||||
thumbnail_target = None
|
||||
|
||||
if not self.no_archive and document.has_archive_version:
|
||||
archive_name = base_name.parent / (base_name.stem + "-archive.pdf")
|
||||
if self.use_folder_prefix:
|
||||
archive_name = Path("archive") / archive_name
|
||||
archive_target = (self.target / archive_name).resolve()
|
||||
document_dict[EXPORTER_ARCHIVE_NAME] = str(archive_name)
|
||||
else:
|
||||
archive_target = None
|
||||
|
||||
return original_target, thumbnail_target, archive_target
|
||||
|
||||
def copy_document_files(
|
||||
self,
|
||||
document: Document,
|
||||
original_target: Path,
|
||||
thumbnail_target: Path | None,
|
||||
archive_target: Path | None,
|
||||
) -> None:
|
||||
"""
|
||||
Copies files from the document storage location to the specified target location.
|
||||
|
||||
If the document is encrypted, the files are decrypted before copying them to the target location.
|
||||
"""
|
||||
if document.storage_type == Document.STORAGE_TYPE_GPG:
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
|
||||
original_target.parent.mkdir(parents=True, exist_ok=True)
|
||||
with document.source_file as out_file:
|
||||
original_target.write_bytes(GnuPG.decrypted(out_file))
|
||||
os.utime(original_target, times=(t, t))
|
||||
|
||||
if thumbnail_target:
|
||||
thumbnail_target.parent.mkdir(parents=True, exist_ok=True)
|
||||
with document.thumbnail_file as out_file:
|
||||
thumbnail_target.write_bytes(GnuPG.decrypted(out_file))
|
||||
os.utime(thumbnail_target, times=(t, t))
|
||||
|
||||
if archive_target:
|
||||
archive_target.parent.mkdir(parents=True, exist_ok=True)
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(document.archive_path, Path)
|
||||
with document.archive_path as out_file:
|
||||
archive_target.write_bytes(GnuPG.decrypted(out_file))
|
||||
os.utime(archive_target, times=(t, t))
|
||||
else:
|
||||
self.check_and_copy(
|
||||
document.source_path,
|
||||
document.checksum,
|
||||
original_target,
|
||||
)
|
||||
|
||||
if thumbnail_target:
|
||||
self.check_and_copy(document.thumbnail_path, None, thumbnail_target)
|
||||
|
||||
if archive_target:
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(document.archive_path, Path)
|
||||
self.check_and_copy(
|
||||
document.archive_path,
|
||||
document.archive_checksum,
|
||||
archive_target,
|
||||
)
|
||||
|
||||
def check_and_write_json(
|
||||
self,
|
||||
content: list[dict] | dict,
|
||||
target: Path,
|
||||
):
|
||||
"""
|
||||
Writes the source content to the target json file.
|
||||
If --compare-json arg was used, don't write to target file if
|
||||
the file exists and checksum is identical to content checksum.
|
||||
This preserves the file timestamps when no changes are made.
|
||||
"""
|
||||
|
||||
target = target.resolve()
|
||||
perform_write = True
|
||||
if target in self.files_in_export_dir:
|
||||
self.files_in_export_dir.remove(target)
|
||||
if self.compare_json:
|
||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
||||
src_str = json.dumps(content, indent=2, ensure_ascii=False)
|
||||
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
|
||||
if src_checksum == target_checksum:
|
||||
perform_write = False
|
||||
|
||||
if perform_write:
|
||||
target.write_text(
|
||||
json.dumps(content, indent=2, ensure_ascii=False),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
def check_and_copy(
|
||||
self,
|
||||
source: Path,
|
||||
source_checksum: str | None,
|
||||
target: Path,
|
||||
):
|
||||
"""
|
||||
Copies the source to the target, if target doesn't exist or the target doesn't seem to match
|
||||
the source attributes
|
||||
"""
|
||||
|
||||
target = target.resolve()
|
||||
if target in self.files_in_export_dir:
|
||||
self.files_in_export_dir.remove(target)
|
||||
|
||||
perform_copy = False
|
||||
|
||||
if target.exists():
|
||||
source_stat = source.stat()
|
||||
target_stat = target.stat()
|
||||
if self.compare_checksums and source_checksum:
|
||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
||||
perform_copy = target_checksum != source_checksum
|
||||
elif (
|
||||
source_stat.st_mtime != target_stat.st_mtime
|
||||
or source_stat.st_size != target_stat.st_size
|
||||
):
|
||||
perform_copy = True
|
||||
else:
|
||||
# Copy if it does not exist
|
||||
perform_copy = True
|
||||
|
||||
if perform_copy:
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
copy_file_with_basic_stats(source, target)
|
||||
|
||||
def encrypt_secret_fields(self, manifest: dict) -> None:
|
||||
"""
|
||||
Encrypts certain fields in the export. Currently limited to the mail account password
|
||||
"""
|
||||
|
||||
if self.passphrase:
|
||||
self.setup_crypto(passphrase=self.passphrase)
|
||||
|
||||
for crypt_config in self.CRYPT_FIELDS:
|
||||
exporter_key = crypt_config["exporter_key"]
|
||||
crypt_fields = crypt_config["fields"]
|
||||
for manifest_record in manifest[exporter_key]:
|
||||
for field in crypt_fields:
|
||||
if manifest_record["fields"][field]:
|
||||
manifest_record["fields"][field] = self.encrypt_string(
|
||||
value=manifest_record["fields"][field],
|
||||
)
|
||||
|
||||
elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0:
|
||||
self.stdout.write(
|
||||
self.style.NOTICE(
|
||||
"No passphrase was given, sensitive fields will be in plaintext",
|
||||
),
|
||||
)
|
||||
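The compare-before-write pattern in check_and_write_json above generalizes to any incremental export: hash the would-be output, compare it to what is already on disk, and skip the write when nothing changed so file timestamps stay stable for sync tools. A minimal standalone sketch of the same idea (write_json_if_changed is a hypothetical helper, not part of this module):

import hashlib
import json
from pathlib import Path


def write_json_if_changed(content, target: Path) -> bool:
    """Write ``content`` as JSON unless an identical file already exists.

    Returns True when a write actually happened.
    """
    serialized = json.dumps(content, indent=2, ensure_ascii=False)
    if target.exists():
        new_checksum = hashlib.md5(serialized.encode("utf-8")).hexdigest()
        old_checksum = hashlib.md5(target.read_bytes()).hexdigest()
        if new_checksum == old_checksum:
            return False  # identical content, keep the old mtime
    target.write_text(serialized, encoding="utf-8")
    return True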
149
src/documents/management/commands/document_fuzzy_match.py
Normal file
@@ -0,0 +1,149 @@
import dataclasses
import multiprocessing
from typing import Final

import rapidfuzz
import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError

from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document


@dataclasses.dataclass(frozen=True)
class _WorkPackage:
    first_doc: Document
    second_doc: Document


@dataclasses.dataclass(frozen=True)
class _WorkResult:
    doc_one_pk: int
    doc_two_pk: int
    ratio: float

    def __lt__(self, other: "_WorkResult") -> bool:
        return self.doc_one_pk < other.doc_one_pk


def _process_and_match(work: _WorkPackage) -> _WorkResult:
    """
    Does basic processing of document content, gets the basic ratio
    and returns the result package
    """
    # Normalize the string some, lower case, whitespace, etc
    first_string = rapidfuzz.utils.default_process(work.first_doc.content)
    second_string = rapidfuzz.utils.default_process(work.second_doc.content)

    # Basic matching ratio
    match = rapidfuzz.fuzz.ratio(first_string, second_string)

    return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)


class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
    help = "Searches for documents where the content almost matches"

    def add_arguments(self, parser):
        parser.add_argument(
            "--ratio",
            default=85.0,
            type=float,
            help="Ratio to consider documents a match",
        )
        parser.add_argument(
            "--delete",
            default=False,
            action="store_true",
            help="If set, one document of matches above the ratio WILL BE DELETED",
        )
        self.add_argument_progress_bar_mixin(parser)
        self.add_argument_processes_mixin(parser)

    def handle(self, *args, **options):
        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

        self.handle_processes_mixin(**options)
        self.handle_progress_bar_mixin(**options)

        if options["delete"]:
            self.stdout.write(
                self.style.WARNING(
                    "The command is configured to delete documents. Use with caution",
                ),
            )

        opt_ratio = options["ratio"]
        checked_pairs: set[tuple[int, int]] = set()
        work_pkgs: list[_WorkPackage] = []

        # Ratio is a float from 0.0 to 100.0
        if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
            raise CommandError("The ratio must be between 0 and 100")

        all_docs = Document.objects.all().order_by("id")

        # Build work packages for processing
        for first_doc in all_docs:
            for second_doc in all_docs:
                # Comparing a document to itself is obviously not useful
                if first_doc.pk == second_doc.pk:
                    continue
                # Skip empty documents (e.g. password-protected)
                if first_doc.content.strip() == "" or second_doc.content.strip() == "":
                    continue
                # Skip pairs which have already been matched;
                # doc 1 to doc 2 is the same as doc 2 to doc 1
                doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
                doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
                if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
                    continue
                checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
                # Actually something useful to work on now
                work_pkgs.append(_WorkPackage(first_doc, second_doc))

        # Don't spin up a pool of 1 process
        if self.process_count == 1:
            results = []
            for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
                results.append(_process_and_match(work))
        else:  # pragma: no cover
            with multiprocessing.Pool(processes=self.process_count) as pool:
                results = list(
                    tqdm.tqdm(
                        pool.imap_unordered(_process_and_match, work_pkgs),
                        total=len(work_pkgs),
                        disable=self.no_progress_bar,
                    ),
                )

        # Check results
        messages = []
        maybe_delete_ids = []
        for result in sorted(results):
            if result.ratio >= opt_ratio:
                messages.append(
                    self.style.NOTICE(
                        f"Document {result.doc_one_pk} fuzzy match"
                        f" to {result.doc_two_pk} (confidence {result.ratio:.3f})\n",
                    ),
                )
                maybe_delete_ids.append(result.doc_two_pk)

        if len(messages) == 0:
            messages.append(
                self.style.SUCCESS("No matches found\n"),
            )
        self.stdout.writelines(
            messages,
        )
        if options["delete"]:
            self.stdout.write(
                self.style.NOTICE(
                    f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
                ),
            )
            Document.objects.filter(pk__in=maybe_delete_ids).delete()
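The nested loop plus checked_pairs set above enumerates each unordered pair of documents exactly once. itertools.combinations expresses the same enumeration directly; a sketch of the equivalent pair-building step (docs stands in for the non-empty documents already fetched as all_docs above):

import itertools

# combinations() yields each unordered pair exactly once, so the
# checked_pairs bookkeeping becomes unnecessary.
docs = [d for d in all_docs if d.content.strip()]
work_pkgs = [
    _WorkPackage(first, second)
    for first, second in itertools.combinations(docs, 2)
]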
448
src/documents/management/commands/document_importer.py
Normal file
@@ -0,0 +1,448 @@
import json
import logging
import os
import tempfile
from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
from zipfile import ZipFile
from zipfile import is_zipfile

import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import FieldDoesNotExist
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.base import DeserializationError
from django.db import IntegrityError
from django.db import transaction
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from filelock import FileLock

from documents.file_handling import create_source_path_directory
from documents.management.commands.mixins import CryptMixin
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import Tag
from documents.parsers import run_convert
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_CRYPTO_SETTINGS_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.signals.handlers import check_paths_and_prune_custom_fields
from documents.signals.handlers import update_filename_and_move_files
from documents.utils import copy_file_with_basic_stats
from paperless import version

if settings.AUDIT_LOG_ENABLED:
    from auditlog.registry import auditlog


@contextmanager
def disable_signal(sig, receiver, sender) -> Generator:
    try:
        sig.disconnect(receiver=receiver, sender=sender)
        yield
    finally:
        sig.connect(receiver=receiver, sender=sender)


class Command(CryptMixin, BaseCommand):
    help = (
        "Using a manifest.json file, load the data from there, and import the "
        "documents it refers to."
    )

    def add_arguments(self, parser) -> None:
        parser.add_argument("source")

        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )

        parser.add_argument(
            "--data-only",
            default=False,
            action="store_true",
            help="If set, only the database will be imported, not files",
        )

        parser.add_argument(
            "--passphrase",
            help="If provided, is used to decrypt sensitive fields in the export",
        )

    def pre_check(self) -> None:
        """
        Runs some initial checks against the state of the install and source, including:
          - Does the target exist?
          - Can we access the target?
          - Does the target have a manifest file?
          - Are there existing files in the document folders?
          - Are there existing users or documents in the database?
        """

        def pre_check_maybe_not_empty() -> None:
            # Skip this check if operating only on the database
            # We can expect data to exist in that case
            if not self.data_only:
                for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]:
                    if document_dir.exists() and document_dir.is_dir():
                        for entry in document_dir.glob("**/*"):
                            if entry.is_dir():
                                continue
                            self.stdout.write(
                                self.style.WARNING(
                                    f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty installation",
                                ),
                            )
                            break
            # But existing users or other data still matters in a data only import
            if (
                User.objects.exclude(username__in=["consumer", "AnonymousUser"]).count()
                != 0
            ):
                self.stdout.write(
                    self.style.WARNING(
                        "Found existing user(s), this might indicate a non-empty installation",
                    ),
                )
            if Document.objects.count() != 0:
                self.stdout.write(
                    self.style.WARNING(
                        "Found existing document(s), this might indicate a non-empty installation",
                    ),
                )

        def pre_check_manifest_exists() -> None:
            if not (self.source / "manifest.json").exists():
                raise CommandError(
                    "That directory doesn't appear to contain a manifest.json file.",
                )

        if not self.source.exists():
            raise CommandError("That path doesn't exist")

        if not os.access(self.source, os.R_OK):
            raise CommandError("That path doesn't appear to be readable")

        pre_check_maybe_not_empty()
        pre_check_manifest_exists()

    def load_manifest_files(self) -> None:
        """
        Loads manifest data from the various JSON files for parsing and loading the database
        """
        main_manifest_path: Path = self.source / "manifest.json"

        with main_manifest_path.open() as infile:
            self.manifest = json.load(infile)
        self.manifest_paths.append(main_manifest_path)

        for file in Path(self.source).glob("**/*-manifest.json"):
            with file.open() as infile:
                self.manifest += json.load(infile)
            self.manifest_paths.append(file)

    def load_metadata(self) -> None:
        """
        Loads either just the version information or the version information and extra data

        Must account for the old style of export as well, with just version.json
        """
        version_path: Path = self.source / "version.json"
        metadata_path: Path = self.source / "metadata.json"
        if not version_path.exists() and not metadata_path.exists():
            self.stdout.write(
                self.style.NOTICE("No version.json or metadata.json file located"),
            )
            return

        if metadata_path.exists():
            with metadata_path.open() as infile:
                data = json.load(infile)
                self.version = data["version"]
                if not self.passphrase and EXPORTER_CRYPTO_SETTINGS_NAME in data:
                    raise CommandError(
                        "No passphrase was given, but this export contains encrypted fields",
                    )
                elif EXPORTER_CRYPTO_SETTINGS_NAME in data:
                    self.load_crypt_params(data)
        elif version_path.exists():
            with version_path.open() as infile:
                self.version = json.load(infile)["version"]

        if self.version and self.version != version.__full_version_str__:
            self.stdout.write(
                self.style.WARNING(
                    "Version mismatch: "
                    f"Currently {version.__full_version_str__},"
                    f" importing {self.version}."
                    " Continuing, but import may fail.",
                ),
            )

    def load_data_to_database(self) -> None:
        """
        As the name implies, loads data from the JSON file(s) into the database
        """
        try:
            with transaction.atomic():
                # delete these since pk can change, re-created from import
                ContentType.objects.all().delete()
                Permission.objects.all().delete()
                for manifest_path in self.manifest_paths:
                    call_command("loaddata", manifest_path)
        except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
            self.stdout.write(self.style.ERROR("Database import failed"))
            if (
                self.version is not None
                and self.version != version.__full_version_str__
            ):  # pragma: no cover
                self.stdout.write(
                    self.style.ERROR(
                        "Version mismatch: "
                        f"Currently {version.__full_version_str__},"
                        f" importing {self.version}",
                    ),
                )
                raise e
            else:
                self.stdout.write(
                    self.style.ERROR("No version information present"),
                )
                raise e

    def handle(self, *args, **options) -> None:
        logging.getLogger().handlers[0].level = logging.ERROR

        self.source = Path(options["source"]).resolve()
        self.data_only: bool = options["data_only"]
        self.no_progress_bar: bool = options["no_progress_bar"]
        self.passphrase: str | None = options.get("passphrase")
        self.version: str | None = None
        self.salt: str | None = None
        self.manifest_paths = []
        self.manifest = []

        # Create a temporary directory for extracting a zip file into, even if the
        # supplied source is not a zip file, to keep the code cleaner.
        with tempfile.TemporaryDirectory() as tmp_dir:
            if is_zipfile(self.source):
                with ZipFile(self.source) as zf:
                    zf.extractall(tmp_dir)
                self.source = Path(tmp_dir)
            self._run_import()

    def _run_import(self):
        self.pre_check()
        self.load_metadata()
        self.load_manifest_files()
        self.check_manifest_validity()
        self.decrypt_secret_fields()

        # see /src/documents/signals/handlers.py
        with (
            disable_signal(
                post_save,
                receiver=update_filename_and_move_files,
                sender=Document,
            ),
            disable_signal(
                m2m_changed,
                receiver=update_filename_and_move_files,
                sender=Document.tags.through,
            ),
            disable_signal(
                post_save,
                receiver=update_filename_and_move_files,
                sender=CustomFieldInstance,
            ),
            disable_signal(
                post_save,
                receiver=check_paths_and_prune_custom_fields,
                sender=CustomField,
            ),
        ):
            if settings.AUDIT_LOG_ENABLED:
                auditlog.unregister(Document)
                auditlog.unregister(Correspondent)
                auditlog.unregister(Tag)
                auditlog.unregister(DocumentType)
                auditlog.unregister(Note)
                auditlog.unregister(CustomField)
                auditlog.unregister(CustomFieldInstance)

            # Fill up the database with whatever is in the manifest
            self.load_data_to_database()

            if not self.data_only:
                self._import_files_from_manifest()
            else:
                self.stdout.write(self.style.NOTICE("Data only import completed"))

        self.stdout.write("Updating search index...")
        call_command(
            "document_index",
            "reindex",
            no_progress_bar=self.no_progress_bar,
        )

    def check_manifest_validity(self) -> None:
        """
        Attempts to verify the manifest is valid. Namely, checking that the files
        referred to exist and can be read
        """

        def check_document_validity(document_record: dict) -> None:
            if EXPORTER_FILE_NAME not in document_record:
                raise CommandError(
                    "The manifest file contains a record which does not "
                    "refer to an actual document file.",
                )

            doc_file = document_record[EXPORTER_FILE_NAME]
            doc_path: Path = self.source / doc_file
            if not doc_path.exists():
                raise CommandError(
                    f'The manifest file refers to "{doc_file}" which does not '
                    "appear to be in the source directory.",
                )
            try:
                with doc_path.open(mode="rb"):
                    pass
            except Exception as e:
                raise CommandError(
                    f"Failed to read from original file {doc_path}",
                ) from e

            if EXPORTER_ARCHIVE_NAME in document_record:
                archive_file = document_record[EXPORTER_ARCHIVE_NAME]
                doc_archive_path: Path = self.source / archive_file
                if not doc_archive_path.exists():
                    raise CommandError(
                        f"The manifest file refers to {archive_file} which "
                        f"does not appear to be in the source directory.",
                    )
                try:
                    with doc_archive_path.open(mode="rb"):
                        pass
                except Exception as e:
                    raise CommandError(
                        f"Failed to read from archive file {doc_archive_path}",
                    ) from e

        self.stdout.write("Checking the manifest")
        for record in self.manifest:
            # Only check if the document files exist if this is not data only
            # We don't care about documents for a data only import
            if not self.data_only and record["model"] == "documents.document":
                check_document_validity(record)

    def _import_files_from_manifest(self) -> None:
        settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
        settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
        settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)

        self.stdout.write("Copy files into paperless...")

        manifest_documents = list(
            filter(lambda r: r["model"] == "documents.document", self.manifest),
        )

        for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
            document = Document.objects.get(pk=record["pk"])

            doc_file = record[EXPORTER_FILE_NAME]
            document_path = self.source / doc_file

            if EXPORTER_THUMBNAIL_NAME in record:
                thumb_file = record[EXPORTER_THUMBNAIL_NAME]
                thumbnail_path = (self.source / thumb_file).resolve()
            else:
                thumbnail_path = None

            if EXPORTER_ARCHIVE_NAME in record:
                archive_file = record[EXPORTER_ARCHIVE_NAME]
                archive_path = self.source / archive_file
            else:
                archive_path = None

            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

            with FileLock(settings.MEDIA_LOCK):
                if Path(document.source_path).is_file():
                    raise FileExistsError(document.source_path)

                create_source_path_directory(document.source_path)

                copy_file_with_basic_stats(document_path, document.source_path)

                if thumbnail_path:
                    if thumbnail_path.suffix in {".png", ".PNG"}:
                        run_convert(
                            density=300,
                            scale="500x5000>",
                            alpha="remove",
                            strip=True,
                            trim=False,
                            auto_orient=True,
                            input_file=f"{thumbnail_path}[0]",
                            output_file=str(document.thumbnail_path),
                        )
                    else:
                        copy_file_with_basic_stats(
                            thumbnail_path,
                            document.thumbnail_path,
                        )

                if archive_path:
                    create_source_path_directory(document.archive_path)
                    # TODO: this assumes that the export is valid and
                    #   archive_filename is present on all documents with
                    #   archived files
                    copy_file_with_basic_stats(archive_path, document.archive_path)

            document.save()

    def decrypt_secret_fields(self) -> None:
        """
        The converse of the exporter: decrypts certain fields of the export before importing them into the database
        """
        if self.passphrase:
            # Salt has been loaded from metadata.json at this point, so it cannot be None
            self.setup_crypto(passphrase=self.passphrase, salt=self.salt)

            had_at_least_one_record = False

            for crypt_config in self.CRYPT_FIELDS:
                importer_model: str = crypt_config["model_name"]
                crypt_fields: list[str] = crypt_config["fields"]
                for record in filter(
                    lambda x: x["model"] == importer_model,
                    self.manifest,
                ):
                    had_at_least_one_record = True
                    for field in crypt_fields:
                        if record["fields"][field]:
                            record["fields"][field] = self.decrypt_string(
                                value=record["fields"][field],
                            )

            if had_at_least_one_record:
                # It's annoying, but the DB is loaded from the JSON directly
                # Maybe could change that in the future?
                (self.source / "manifest.json").write_text(
                    json.dumps(self.manifest, indent=2, ensure_ascii=False),
                )
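The exporter and importer are designed as a round trip: the importer consumes exactly the manifest.json, metadata.json and files that the exporter writes. A hedged sketch of driving both from code; the path and passphrase are just examples, and the exporter's --passphrase flag is assumed from its use of self.passphrase above:

from django.core.management import call_command

# Export everything (with sensitive fields encrypted) to ../export,
# then restore it into a fresh install. The passphrase must match on
# both sides.
call_command("document_exporter", "../export", "--passphrase", "s3cret")
call_command("document_importer", "../export", "--passphrase", "s3cret")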
22
src/documents/management/commands/document_index.py
Normal file
@@ -0,0 +1,22 @@
from django.core.management import BaseCommand
from django.db import transaction

from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import index_optimize
from documents.tasks import index_reindex


class Command(ProgressBarMixin, BaseCommand):
    help = "Manages the document index."

    def add_arguments(self, parser):
        parser.add_argument("command", choices=["reindex", "optimize"])
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, *args, **options):
        self.handle_progress_bar_mixin(**options)
        with transaction.atomic():
            if options["command"] == "reindex":
                index_reindex(progress_bar_disable=self.no_progress_bar)
            elif options["command"] == "optimize":
                index_optimize()
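This command is a thin wrapper around the two index tasks, which makes it easy to invoke from other code; the importer above does exactly this after loading data. A minimal example:

from django.core.management import call_command

# Rebuild the whole search index without a progress bar, e.g. from a scheduled job.
call_command("document_index", "reindex", no_progress_bar=True)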
25
src/documents/management/commands/document_renamer.py
Normal file
@@ -0,0 +1,25 @@
import logging

import tqdm
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save

from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document


class Command(ProgressBarMixin, BaseCommand):
    help = "This will rename all documents to match the latest filename format."

    def add_arguments(self, parser):
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, *args, **options):
        self.handle_progress_bar_mixin(**options)
        logging.getLogger().handlers[0].level = logging.ERROR

        for document in tqdm.tqdm(
            Document.objects.all(),
            disable=self.no_progress_bar,
        ):
            post_save.send(Document, instance=document, created=False)
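Note that the renamer does no renaming itself: it re-fires post_save so the update_filename_and_move_files handler (see handlers.py) does the work. The same trick applies to a single document; a sketch (the primary key is just an example):

from django.db.models.signals import post_save

from documents.models import Document

# Re-run the filename handler for one document only -- the same signal
# re-send the command above performs in bulk.
doc = Document.objects.get(pk=123)
post_save.send(Document, instance=doc, created=False)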
136
src/documents/management/commands/document_retagger.py
Normal file
@@ -0,0 +1,136 @@
import logging

import tqdm
from django.core.management.base import BaseCommand

from documents.classifier import load_classifier
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_document_type
from documents.signals.handlers import set_storage_path
from documents.signals.handlers import set_tags

logger = logging.getLogger("paperless.management.retagger")


class Command(ProgressBarMixin, BaseCommand):
    help = (
        "Using the current classification model, assigns correspondents, tags "
        "and document types to all documents, effectively allowing you to "
        "back-tag all previously indexed documents with metadata created (or "
        "modified) after their initial import."
    )

    def add_arguments(self, parser):
        parser.add_argument("-c", "--correspondent", default=False, action="store_true")
        parser.add_argument("-T", "--tags", default=False, action="store_true")
        parser.add_argument("-t", "--document_type", default=False, action="store_true")
        parser.add_argument("-s", "--storage_path", default=False, action="store_true")
        parser.add_argument("-i", "--inbox-only", default=False, action="store_true")
        parser.add_argument(
            "--use-first",
            default=False,
            action="store_true",
            help=(
                "By default this command won't try to assign a correspondent "
                "if more than one matches the document. Use this flag if "
                "you'd rather it just pick the first one it finds."
            ),
        )
        parser.add_argument(
            "-f",
            "--overwrite",
            default=False,
            action="store_true",
            help=(
                "If set, the document retagger will overwrite any previously "
                "set correspondent, document type or tags, and remove those "
                "that no longer match due to changed rules."
            ),
        )
        self.add_argument_progress_bar_mixin(parser)
        parser.add_argument(
            "--suggest",
            default=False,
            action="store_true",
            help="Return the suggestion, don't change anything.",
        )
        parser.add_argument(
            "--base-url",
            help="The base URL to use to build the link to the documents.",
        )
        parser.add_argument(
            "--id-range",
            help="A range of document ids on which the retagging should be applied.",
            nargs=2,
            type=int,
        )

    def handle(self, *args, **options):
        self.handle_progress_bar_mixin(**options)

        if options["inbox_only"]:
            queryset = Document.objects.filter(tags__is_inbox_tag=True)
        else:
            queryset = Document.objects.all()

        if options["id_range"]:
            queryset = queryset.filter(
                id__range=(options["id_range"][0], options["id_range"][1]),
            )

        documents = queryset.distinct()

        classifier = load_classifier()

        for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
            if options["correspondent"]:
                set_correspondent(
                    sender=None,
                    document=document,
                    classifier=classifier,
                    replace=options["overwrite"],
                    use_first=options["use_first"],
                    suggest=options["suggest"],
                    base_url=options["base_url"],
                    stdout=self.stdout,
                    style_func=self.style,
                )

            if options["document_type"]:
                set_document_type(
                    sender=None,
                    document=document,
                    classifier=classifier,
                    replace=options["overwrite"],
                    use_first=options["use_first"],
                    suggest=options["suggest"],
                    base_url=options["base_url"],
                    stdout=self.stdout,
                    style_func=self.style,
                )

            if options["tags"]:
                set_tags(
                    sender=None,
                    document=document,
                    classifier=classifier,
                    replace=options["overwrite"],
                    suggest=options["suggest"],
                    base_url=options["base_url"],
                    stdout=self.stdout,
                    style_func=self.style,
                )
            if options["storage_path"]:
                set_storage_path(
                    sender=None,
                    document=document,
                    classifier=classifier,
                    replace=options["overwrite"],
                    use_first=options["use_first"],
                    suggest=options["suggest"],
                    base_url=options["base_url"],
                    stdout=self.stdout,
                    style_func=self.style,
                )
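The --suggest flag combined with --id-range makes it safe to preview what the classifier would do before committing. A hedged usage sketch, built only from the flags defined in add_arguments above (the base URL and id range are example values):

from django.core.management import call_command

# Preview (but do not apply) tag suggestions for documents 100-199,
# linking each suggestion to the web UI.
call_command(
    "document_retagger",
    "--tags",
    "--suggest",
    "--base-url", "https://paperless.example.com",
    "--id-range", "100", "199",
)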
17
src/documents/management/commands/document_sanity_checker.py
Normal file
@@ -0,0 +1,17 @@
from django.core.management.base import BaseCommand

from documents.management.commands.mixins import ProgressBarMixin
from documents.sanity_checker import check_sanity


class Command(ProgressBarMixin, BaseCommand):
    help = "This command checks your document archive for issues."

    def add_arguments(self, parser):
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, *args, **options):
        self.handle_progress_bar_mixin(**options)
        messages = check_sanity(progress=self.use_progress_bar, scheduled=False)

        messages.log_messages()
84
src/documents/management/commands/document_thumbnails.py
Normal file
@@ -0,0 +1,84 @@
import logging
import multiprocessing
import shutil

import tqdm
from django import db
from django.core.management.base import BaseCommand

from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.parsers import get_parser_class_for_mime_type


def _process_document(doc_id):
    document: Document = Document.objects.get(id=doc_id)
    parser_class = get_parser_class_for_mime_type(document.mime_type)

    if parser_class:
        parser = parser_class(logging_group=None)
    else:
        print(f"{document} No parser for mime type {document.mime_type}")  # noqa: T201
        return

    try:
        thumb = parser.get_thumbnail(
            document.source_path,
            document.mime_type,
            document.get_public_filename(),
        )

        shutil.move(thumb, document.thumbnail_path)
    finally:
        parser.cleanup()


class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
    help = "This will regenerate the thumbnails for all documents."

    def add_arguments(self, parser):
        parser.add_argument(
            "-d",
            "--document",
            default=None,
            type=int,
            required=False,
            help=(
                "Specify the ID of a document, and this command will only "
                "run on this specific document."
            ),
        )
        self.add_argument_progress_bar_mixin(parser)
        self.add_argument_processes_mixin(parser)

    def handle(self, *args, **options):
        logging.getLogger().handlers[0].level = logging.ERROR

        self.handle_processes_mixin(**options)
        self.handle_progress_bar_mixin(**options)

        if options["document"]:
            documents = Document.objects.filter(pk=options["document"])
        else:
            documents = Document.objects.all()

        ids = [doc.id for doc in documents]

        # Note to future self: this prevents django from reusing database
        # connections between processes, which is bad and does not work
        # with postgres.
        db.connections.close_all()

        if self.process_count == 1:
            for doc_id in ids:
                _process_document(doc_id)
        else:  # pragma: no cover
            with multiprocessing.Pool(processes=self.process_count) as pool:
                list(
                    tqdm.tqdm(
                        pool.imap_unordered(_process_document, ids),
                        total=len(ids),
                        disable=self.no_progress_bar,
                    ),
                )
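The close_all() call above is the fork-safety pattern for any Django code that fans work out to multiprocessing: connections inherited across fork() would be shared between workers and break, notably with postgres. A minimal standalone sketch of the same pattern (_work is a hypothetical worker):

import multiprocessing

from django import db


def _work(pk: int) -> None:
    # Each worker opens its own connection lazily on its first query.
    ...


def run_parallel(ids: list[int], processes: int) -> None:
    # Close every inherited connection before forking, so no socket to
    # the database is shared between the pool workers.
    db.connections.close_all()
    with multiprocessing.Pool(processes=processes) as pool:
        pool.map(_work, ids)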
22
src/documents/management/commands/loaddata_stdin.py
Normal file
@@ -0,0 +1,22 @@
import sys

from django.core.management.commands.loaddata import Command as LoadDataCommand


# This class is used to migrate data between databases
# That's difficult to test
class Command(LoadDataCommand):  # pragma: no cover
    """
    Allow the loading of data from standard in. Sourced originally from:
    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed)
    """

    def parse_name(self, fixture_name):
        self.compression_formats["stdin"] = (lambda x, y: sys.stdin, None)
        if fixture_name == "-":
            return "-", "json", "stdin"
        # Fall back to the default behaviour for regular fixture names
        return super().parse_name(fixture_name)

    def find_fixtures(self, fixture_label):
        if fixture_label == "-":
            return [("-", None, "-")]
        return super().find_fixtures(fixture_label)
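The command registers a pseudo "stdin" compression format whose opener just returns sys.stdin, so passing "-" as the fixture name streams JSON straight from a pipe (the shell equivalent of `manage.py dumpdata | manage.py loaddata_stdin -`). A hedged in-process sketch of the same idea; the fixture payload is a made-up example and relies on the lambda above reading sys.stdin at call time:

import io
import sys

from django.core.management import call_command

fixture_json = '[{"model": "documents.tag", "pk": 1, "fields": {"name": "demo"}}]'
sys.stdin = io.StringIO(fixture_json)
call_command("loaddata_stdin", "-")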
66
src/documents/management/commands/manage_superuser.py
Normal file
@@ -0,0 +1,66 @@
import logging
import os
from argparse import RawTextHelpFormatter

from django.contrib.auth.models import User
from django.core.management.base import BaseCommand

logger = logging.getLogger("paperless.management.superuser")


class Command(BaseCommand):
    help = (
        "Creates a Django superuser:\n"
        "  User named: admin\n"
        "  Email: root@localhost\n"
        "  Password: based on env variable PAPERLESS_ADMIN_PASSWORD\n"
        "No superuser will be created, when:\n"
        "  - The username is already taken\n"
        "  - A superuser already exists\n"
        "  - PAPERLESS_ADMIN_PASSWORD is not set"
    )

    def create_parser(self, *args, **kwargs):
        parser = super().create_parser(*args, **kwargs)
        parser.formatter_class = RawTextHelpFormatter
        return parser

    def handle(self, *args, **options):
        username = os.getenv("PAPERLESS_ADMIN_USER", "admin")
        mail = os.getenv("PAPERLESS_ADMIN_MAIL", "root@localhost")
        password = os.getenv("PAPERLESS_ADMIN_PASSWORD")

        # Check if there's already a user called admin
        if User.objects.filter(username=username).exists():
            self.stdout.write(
                self.style.NOTICE(
                    f"Did not create superuser, a user {username} already exists",
                ),
            )
            return

        # Check if any superuser exists already; leave things as they are if so
        if User.objects.filter(is_superuser=True).count() > 0:
            self.stdout.write(
                self.style.NOTICE(
                    "Did not create superuser, the DB already contains superusers",
                ),
            )
            return

        if password is None:
            self.stdout.write(
                self.style.ERROR(
                    "Please check if PAPERLESS_ADMIN_PASSWORD has been"
                    " set in the environment",
                ),
            )
        else:
            # Create superuser with password based on env variable
            User.objects.create_superuser(username, mail, password)
            self.stdout.write(
                self.style.SUCCESS(
                    f'Created superuser "{username}" with provided password.',
                ),
            )
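Since the command is driven entirely by environment variables, a first-boot entrypoint can set them and invoke it programmatically. A sketch (the values are examples only):

import os

from django.core.management import call_command

os.environ.setdefault("PAPERLESS_ADMIN_USER", "admin")
os.environ.setdefault("PAPERLESS_ADMIN_PASSWORD", "change-me")
call_command("manage_superuser")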
175
src/documents/management/commands/mixins.py
Normal file
@@ -0,0 +1,175 @@
import base64
import os
from argparse import ArgumentParser
from typing import TypedDict

from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
from django.core.management import CommandError

from documents.settings import EXPORTER_CRYPTO_ALGO_NAME
from documents.settings import EXPORTER_CRYPTO_KEY_ITERATIONS_NAME
from documents.settings import EXPORTER_CRYPTO_KEY_SIZE_NAME
from documents.settings import EXPORTER_CRYPTO_SALT_NAME
from documents.settings import EXPORTER_CRYPTO_SETTINGS_NAME


class CryptFields(TypedDict):
    exporter_key: str
    model_name: str
    fields: list[str]


class MultiProcessMixin:
    """
    Small class to handle adding an argument and validating it
    for the use of multiple processes
    """

    def add_argument_processes_mixin(self, parser: ArgumentParser):
        parser.add_argument(
            "--processes",
            default=max(1, os.cpu_count() // 4),
            type=int,
            help="Number of processes to distribute work amongst",
        )

    def handle_processes_mixin(self, *args, **options):
        self.process_count = options["processes"]
        if self.process_count < 1:
            raise CommandError("There must be at least 1 process")


class ProgressBarMixin:
    """
    Many commands use a progress bar, which can be disabled
    via this class
    """

    def add_argument_progress_bar_mixin(self, parser: ArgumentParser):
        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )

    def handle_progress_bar_mixin(self, *args, **options):
        self.no_progress_bar = options["no_progress_bar"]
        self.use_progress_bar = not self.no_progress_bar


class CryptMixin:
    """
    Fully based on:
    https://cryptography.io/en/latest/fernet/#using-passwords-with-fernet

    To encrypt:
      1. Call setup_crypto, providing the user's passphrase
      2. Call encrypt_string with a value
      3. Store the returned hexadecimal representation of the value

    To decrypt:
      1. Load the required parameters:
         a. key iterations
         b. key size
         c. key algorithm
      2. Call setup_crypto, providing the user's passphrase and the stored salt
      3. Call decrypt_string with a value
      4. Use the returned value
    """

    # This matches Django's default for now
    # https://github.com/django/django/blob/adae61942/django/contrib/auth/hashers.py#L315

    # Set the defaults to be used during export
    # During import, these are overridden from the loaded values to ensure decryption is possible
    key_iterations = 1_000_000
    salt_size = 16
    key_size = 32
    kdf_algorithm = "pbkdf2_sha256"

    CRYPT_FIELDS: list[CryptFields] = [
        {
            "exporter_key": "mail_accounts",
            "model_name": "paperless_mail.mailaccount",
            "fields": [
                "password",
                "refresh_token",
            ],
        },
        {
            "exporter_key": "social_tokens",
            "model_name": "socialaccount.socialtoken",
            "fields": [
                "token",
                "token_secret",
            ],
        },
    ]

    def get_crypt_params(self) -> dict[str, dict[str, str | int]]:
        return {
            EXPORTER_CRYPTO_SETTINGS_NAME: {
                EXPORTER_CRYPTO_ALGO_NAME: self.kdf_algorithm,
                EXPORTER_CRYPTO_KEY_ITERATIONS_NAME: self.key_iterations,
                EXPORTER_CRYPTO_KEY_SIZE_NAME: self.key_size,
                EXPORTER_CRYPTO_SALT_NAME: self.salt,
            },
        }

    def load_crypt_params(self, metadata: dict):
        # Load up the values for setting up decryption
        self.kdf_algorithm: str = metadata[EXPORTER_CRYPTO_SETTINGS_NAME][
            EXPORTER_CRYPTO_ALGO_NAME
        ]
        self.key_iterations: int = metadata[EXPORTER_CRYPTO_SETTINGS_NAME][
            EXPORTER_CRYPTO_KEY_ITERATIONS_NAME
        ]
        self.key_size: int = metadata[EXPORTER_CRYPTO_SETTINGS_NAME][
            EXPORTER_CRYPTO_KEY_SIZE_NAME
        ]
        self.salt: str = metadata[EXPORTER_CRYPTO_SETTINGS_NAME][
            EXPORTER_CRYPTO_SALT_NAME
        ]

    def setup_crypto(self, *, passphrase: str, salt: str | None = None):
        """
        Constructs a class for encryption or decryption using the specified passphrase and salt

        Salt is assumed to be a hexadecimal representation of a cryptographically secure random byte string.
        If not provided, a new salt is generated from the system's secure random source.
        """
        self.salt = salt or os.urandom(self.salt_size).hex()

        # Derive the KDF based on loaded settings
        if self.kdf_algorithm == "pbkdf2_sha256":
            kdf = PBKDF2HMAC(
                algorithm=hashes.SHA256(),
                length=self.key_size,
                salt=bytes.fromhex(self.salt),
                iterations=self.key_iterations,
            )
        else:  # pragma: no cover
            raise CommandError(
                f"{self.kdf_algorithm} is an unknown key derivation function",
            )

        key = base64.urlsafe_b64encode(kdf.derive(passphrase.encode("utf-8")))

        self.fernet = Fernet(key)

    def encrypt_string(self, *, value: str) -> str:
        """
        Given a string value, encrypts it and returns the hexadecimal representation of the encrypted token
        """
        return self.fernet.encrypt(value.encode("utf-8")).hex()

    def decrypt_string(self, *, value: str) -> str:
        """
        Given a string value, decrypts it and returns the original value of the field
        """
        return self.fernet.decrypt(bytes.fromhex(value)).decode("utf-8")
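A minimal round trip through CryptMixin, following the docstring above. ExampleCrypt is a throwaway subclass just for illustration; the key point is that the salt (and KDF parameters) must be carried over to the decrypting side, exactly as the exporter stores them in metadata.json:

class ExampleCrypt(CryptMixin):
    pass


exporter = ExampleCrypt()
exporter.setup_crypto(passphrase="correct horse")
token = exporter.encrypt_string(value="hunter2")

importer = ExampleCrypt()
importer.setup_crypto(passphrase="correct horse", salt=exporter.salt)
assert importer.decrypt_string(value=token) == "hunter2"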
39
src/documents/management/commands/prune_audit_logs.py
Normal file
@@ -0,0 +1,39 @@
from auditlog.models import LogEntry
from django.core.management.base import BaseCommand
from django.db import transaction
from tqdm import tqdm

from documents.management.commands.mixins import ProgressBarMixin


class Command(BaseCommand, ProgressBarMixin):
    """
    Prune the audit logs of objects that no longer exist.
    """

    help = "Prunes the audit logs of objects that no longer exist."

    def add_arguments(self, parser):
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, **options):
        self.handle_progress_bar_mixin(**options)
        with transaction.atomic():
            for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
                model_class = log_entry.content_type.model_class()
                # use global_objects for SoftDeleteModel
                objects = (
                    model_class.global_objects
                    if hasattr(model_class, "global_objects")
                    else model_class.objects
                )
                if (
                    log_entry.object_id
                    and not objects.filter(pk=log_entry.object_id).exists()
                ):
                    log_entry.delete()
                    tqdm.write(
                        self.style.NOTICE(
                            f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
                        ),
                    )
508
src/documents/matching.py
Normal file
@@ -0,0 +1,508 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from fnmatch import fnmatch
|
||||
from fnmatch import translate as fnmatch_translate
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.models import Correspondent
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import MatchingModel
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import Workflow
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.permissions import get_objects_for_user_owner_aware
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.db.models import QuerySet

from documents.classifier import DocumentClassifier

logger = logging.getLogger("paperless.matching")


def log_reason(
    matching_model: MatchingModel | WorkflowTrigger,
    document: Document,
    reason: str,
):
    class_name = type(matching_model).__name__
    name = (
        matching_model.name if hasattr(matching_model, "name") else str(matching_model)
    )
    logger.debug(
        f"{class_name} {name} matched on document {document} because {reason}",
    )


def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
    pred_id = (
        classifier.predict_correspondent(document.suggestion_content)
        if classifier
        else None
    )

    if user is None and document.owner is not None:
        user = document.owner

    if user is not None:
        correspondents = get_objects_for_user_owner_aware(
            user,
            "documents.view_correspondent",
            Correspondent,
        )
    else:
        correspondents = Correspondent.objects.all()

    return list(
        filter(
            lambda o: matches(o, document)
            or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
            correspondents,
        ),
    )


def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
    pred_id = (
        classifier.predict_document_type(document.suggestion_content)
        if classifier
        else None
    )
    if user is None and document.owner is not None:
        user = document.owner

    if user is not None:
        document_types = get_objects_for_user_owner_aware(
            user,
            "documents.view_documenttype",
            DocumentType,
        )
    else:
        document_types = DocumentType.objects.all()

    return list(
        filter(
            lambda o: matches(o, document)
            or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
            document_types,
        ),
    )


def match_tags(document: Document, classifier: DocumentClassifier, user=None):
    predicted_tag_ids = (
        classifier.predict_tags(document.suggestion_content) if classifier else []
    )

    if user is None and document.owner is not None:
        user = document.owner

    if user is not None:
        tags = get_objects_for_user_owner_aware(user, "documents.view_tag", Tag)
    else:
        tags = Tag.objects.all()

    return list(
        filter(
            lambda o: matches(o, document)
            or (
                o.matching_algorithm == MatchingModel.MATCH_AUTO
                and o.pk in predicted_tag_ids
            ),
            tags,
        ),
    )


def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
    pred_id = (
        classifier.predict_storage_path(document.suggestion_content)
        if classifier
        else None
    )

    if user is None and document.owner is not None:
        user = document.owner

    if user is not None:
        storage_paths = get_objects_for_user_owner_aware(
            user,
            "documents.view_storagepath",
            StoragePath,
        )
    else:
        storage_paths = StoragePath.objects.all()

    return list(
        filter(
            lambda o: matches(o, document)
            or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
            storage_paths,
        ),
    )


def matches(matching_model: MatchingModel, document: Document):
    search_kwargs = {}

    document_content = document.content

    # Check that match is not empty
    if not matching_model.match.strip():
        return False

    if matching_model.is_insensitive:
        search_kwargs = {"flags": re.IGNORECASE}

    if matching_model.matching_algorithm == MatchingModel.MATCH_NONE:
        return False

    elif matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
        for word in _split_match(matching_model):
            search_result = re.search(rf"\b{word}\b", document_content, **search_kwargs)
            if not search_result:
                return False
        log_reason(
            matching_model,
            document,
            f"it contains all of these words: {matching_model.match}",
        )
        return True

    elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
        for word in _split_match(matching_model):
            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
                log_reason(matching_model, document, f"it contains this word: {word}")
                return True
        return False

    elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
        result = bool(
            re.search(
                rf"\b{re.escape(matching_model.match)}\b",
                document_content,
                **search_kwargs,
            ),
        )
        if result:
            log_reason(
                matching_model,
                document,
                f'it contains this string: "{matching_model.match}"',
            )
        return result

    elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
        try:
            match = re.search(
                re.compile(matching_model.match, **search_kwargs),
                document_content,
            )
        except re.error:
            logger.error(
                f"Error while processing regular expression {matching_model.match}",
            )
            return False
        if match:
            log_reason(
                matching_model,
                document,
                f"the string {match.group()} matches the regular expression "
                f"{matching_model.match}",
            )
        return bool(match)

    elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
        from rapidfuzz import fuzz

        match = re.sub(r"[^\w\s]", "", matching_model.match)
        text = re.sub(r"[^\w\s]", "", document_content)
        if matching_model.is_insensitive:
            match = match.lower()
            text = text.lower()
        if fuzz.partial_ratio(match, text, score_cutoff=90):
            # TODO: make this better
            log_reason(
                matching_model,
                document,
                f"parts of the document content somehow match the string "
                f"{matching_model.match}",
            )
            return True
        else:
            return False

    elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.
        return False

    else:
        raise NotImplementedError("Unsupported matching algorithm")
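

# The MATCH_ANY / MATCH_ALL branches above rely on \b word boundaries, so
# keywords only match whole words. A standalone demonstration with plain
# `re`, no project state involved:
def _demo_word_boundaries():  # pragma: no cover - illustrative only
    content = "Invoice #4711 from ACME Corp."
    assert re.search(r"\bACME\b", content)           # whole word: hit
    assert not re.search(r"\bACME\b", "ACMEX Corp")  # substring: miss
    # is_insensitive toggles re.IGNORECASE, exactly as in matches():
    assert re.search(r"\binvoice\b", content, flags=re.IGNORECASE)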


def _split_match(matching_model):
    """
    Splits the match to individual keywords, getting rid of unnecessary
    spaces and grouping quoted words together.

    Example:
        '  some random  words "with   quotes  " and   spaces'
        ==>
        ["some", "random", "words", "with\\s+quotes", "and", "spaces"]
    """
    findterms = re.compile(r'"([^"]+)"|(\S+)').findall
    normspace = re.compile(r"\s+").sub
    return [
        # normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
        re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
        for t in findterms(matching_model.match)
    ]
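

# What the comprehension above actually yields. SimpleNamespace is just a
# stand-in for any object with a .match attribute:
def _demo_split_match():  # pragma: no cover - illustrative only
    from types import SimpleNamespace

    stub = SimpleNamespace(match='  some random  words "with   quotes  " and   spaces')
    # Quoted phrases stay a single token; their inner spaces loosen to \s+
    # so they still match across line wraps in the OCR'd text.
    assert _split_match(stub) == [
        "some",
        "random",
        "words",
        "with\\s+quotes",
        "and",
        "spaces",
    ]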


def consumable_document_matches_workflow(
    document: ConsumableDocument,
    trigger: WorkflowTrigger,
) -> tuple[bool, str]:
    """
    Returns True if the ConsumableDocument matches all filters from the workflow trigger,
    False otherwise. Includes a reason if it doesn't match.
    """

    trigger_matched = True
    reason = ""

    # Document source vs trigger source
    if len(trigger.sources) > 0 and document.source not in [
        int(x) for x in list(trigger.sources)
    ]:
        reason = (
            f"Document source {document.source.name} not in"
            f" {[DocumentSource(int(x)).name for x in trigger.sources]}"
        )
        trigger_matched = False

    # Document mail rule vs trigger mail rule
    if (
        trigger.filter_mailrule is not None
        and document.mailrule_id != trigger.filter_mailrule.pk
    ):
        reason = (
            f"Document mail rule {document.mailrule_id}"
            f" != {trigger.filter_mailrule.pk}"
        )
        trigger_matched = False

    # Document filename vs trigger filename
    if (
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
        and not fnmatch(
            document.original_file.name.lower(),
            trigger.filter_filename.lower(),
        )
    ):
        reason = (
            f"Document filename {document.original_file.name} does not match"
            f" {trigger.filter_filename.lower()}"
        )
        trigger_matched = False

    # Document path vs trigger path

    # Use the original_path if set, else use the original_file
    match_against = (
        document.original_path
        if document.original_path is not None
        else document.original_file
    )

    if (
        trigger.filter_path is not None
        and len(trigger.filter_path) > 0
        and not fnmatch(
            match_against,
            trigger.filter_path,
        )
    ):
        reason = (
            f"Document path {document.original_file}"
            f" does not match {trigger.filter_path}"
        )
        trigger_matched = False

    return (trigger_matched, reason)
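

# The filename and path filters above use fnmatch glob semantics, with both
# sides lower-cased first so patterns are effectively case-insensitive:
def _demo_fnmatch_filters():  # pragma: no cover - illustrative only
    assert fnmatch("scan_2024_04.pdf".lower(), "*.pdf")
    assert fnmatch("SCAN_2024_04.PDF".lower(), "*.pdf")  # lower-cased first
    assert fnmatch("/mnt/consume/inbox/a.pdf", "*/inbox/*")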


def existing_document_matches_workflow(
    document: Document,
    trigger: WorkflowTrigger,
) -> tuple[bool, str]:
    """
    Returns True if the Document matches all filters from the workflow trigger,
    False otherwise. Includes a reason if it doesn't match.
    """

    trigger_matched = True
    reason = ""

    if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches(
        trigger,
        document,
    ):
        reason = (
            f"Document content matching settings for algorithm '{trigger.matching_algorithm}' did not match"
        )
        trigger_matched = False

    # Document tags vs trigger has_tags
    if (
        trigger.filter_has_tags.all().count() > 0
        and document.tags.filter(
            id__in=trigger.filter_has_tags.all().values_list("id"),
        ).count()
        == 0
    ):
        reason = (
            f"Document tags {document.tags.all()} do not include"
            f" {trigger.filter_has_tags.all()}"
        )
        trigger_matched = False

    # Document correspondent vs trigger has_correspondent
    if (
        trigger.filter_has_correspondent is not None
        and document.correspondent != trigger.filter_has_correspondent
    ):
        reason = (
            f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}"
        )
        trigger_matched = False

    # Document document_type vs trigger has_document_type
    if (
        trigger.filter_has_document_type is not None
        and document.document_type != trigger.filter_has_document_type
    ):
        reason = (
            f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}"
        )
        trigger_matched = False

    # Document storage_path vs trigger has_storage_path
    if (
        trigger.filter_has_storage_path is not None
        and document.storage_path != trigger.filter_has_storage_path
    ):
        reason = (
            f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}"
        )
        trigger_matched = False

    # Document original_filename vs trigger filename
    if (
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
        and document.original_filename is not None
        and not fnmatch(
            document.original_filename.lower(),
            trigger.filter_filename.lower(),
        )
    ):
        reason = (
            f"Document filename {document.original_filename} does not match"
            f" {trigger.filter_filename.lower()}"
        )
        trigger_matched = False

    return (trigger_matched, reason)


def prefilter_documents_by_workflowtrigger(
    documents: QuerySet[Document],
    trigger: WorkflowTrigger,
) -> QuerySet[Document]:
    """
    To prevent scheduled workflows from checking every document, we prefilter
    the documents by the workflow trigger filters. This is done before e.g.
    document_matches_workflow in run_workflows.
    """

    if trigger.filter_has_tags.all().count() > 0:
        documents = documents.filter(
            tags__in=trigger.filter_has_tags.all(),
        ).distinct()

    if trigger.filter_has_correspondent is not None:
        documents = documents.filter(
            correspondent=trigger.filter_has_correspondent,
        )

    if trigger.filter_has_document_type is not None:
        documents = documents.filter(
            document_type=trigger.filter_has_document_type,
        )

    if trigger.filter_has_storage_path is not None:
        documents = documents.filter(
            storage_path=trigger.filter_has_storage_path,
        )

    if trigger.filter_filename is not None and len(trigger.filter_filename) > 0:
        # the true fnmatch will actually run later, so we just want a loose filter here
        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
        regex = f"(?i){regex}"
        documents = documents.filter(original_filename__regex=regex)

    return documents
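

# The prefilter's closing trick in isolation: fnmatch.translate turns the
# glob into an anchored regex; stripping "^"/"$" (the anchors emitted by
# older Pythons) and prepending (?i) yields a loose, case-insensitive
# pattern the database can evaluate cheaply. The authoritative fnmatch
# still runs later, so false positives here are harmless.
def _demo_glob_prefilter():  # pragma: no cover - illustrative only
    regex = fnmatch_translate("*.pdf").lstrip("^").rstrip("$")
    return f"(?i){regex}"  # e.g. '(?i)(?s:.*\\.pdf)\\Z' on recent Pythons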


def document_matches_workflow(
    document: ConsumableDocument | Document,
    workflow: Workflow,
    trigger_type: WorkflowTrigger.WorkflowTriggerType,
) -> bool:
    """
    Returns True if the ConsumableDocument or Document matches all filters and
    settings from the workflow trigger, False otherwise.
    """

    trigger_matched = True
    if workflow.triggers.filter(type=trigger_type).count() == 0:
        trigger_matched = False
        logger.info(f"Document did not match {workflow}")
        logger.debug(f"No matching triggers with type {trigger_type} found")
    else:
        for trigger in workflow.triggers.filter(type=trigger_type):
            if trigger_type == WorkflowTrigger.WorkflowTriggerType.CONSUMPTION:
                trigger_matched, reason = consumable_document_matches_workflow(
                    document,
                    trigger,
                )
            elif (
                trigger_type == WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED
                or trigger_type == WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED
                or trigger_type == WorkflowTrigger.WorkflowTriggerType.SCHEDULED
            ):
                trigger_matched, reason = existing_document_matches_workflow(
                    document,
                    trigger,
                )
            else:
                # New trigger types need to be explicitly checked above
                raise Exception(f"Trigger type {trigger_type} not yet supported")

            if trigger_matched:
                logger.info(f"Document matched {trigger} from {workflow}")
                # matched, bail early
                return True
            else:
                logger.info(f"Document did not match {workflow}")
                logger.debug(reason)

    return trigger_matched

40
src/documents/migrations/0001_initial.py
Normal file
@@ -0,0 +1,40 @@
# Generated by Django 1.9 on 2015-12-20 19:10

from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="Document",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("sender", models.CharField(blank=True, db_index=True, max_length=128)),
                ("title", models.CharField(blank=True, db_index=True, max_length=128)),
                (
                    "content",
                    models.TextField(
                        db_index=(
                            "mysql" not in settings.DATABASES["default"]["ENGINE"]
                        ),
                    ),
                ),
                ("created", models.DateTimeField(auto_now_add=True)),
                ("modified", models.DateTimeField(auto_now=True)),
            ],
        ),
    ]
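

# Editorial note: the conditional db_index on "content" exists because MySQL
# of this era could not index unbounded TEXT columns; the expression runs at
# import time against the configured engine. A toy evaluation (the settings
# dict below is a made-up example, not this project's configuration):
def _demo_engine_conditional_index():  # pragma: no cover - illustrative only
    DATABASES = {"default": {"ENGINE": "django.db.backends.postgresql"}}
    return "mysql" not in DATABASES["default"]["ENGINE"]  # True: create index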

26
src/documents/migrations/0002_auto_20151226_1316.py
Normal file
@@ -0,0 +1,26 @@
# Generated by Django 1.9 on 2015-12-26 13:16

import django.utils.timezone
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0001_initial"),
    ]

    operations = [
        migrations.AlterModelOptions(
            name="document",
            options={"ordering": ("sender", "title")},
        ),
        migrations.AlterField(
            model_name="document",
            name="created",
            field=models.DateTimeField(
                default=django.utils.timezone.now,
                editable=False,
            ),
        ),
    ]

70
src/documents/migrations/0003_sender.py
Normal file
@@ -0,0 +1,70 @@
# Generated by Django 1.9 on 2016-01-11 12:21

import django.db.models.deletion
from django.db import migrations
from django.db import models
from django.template.defaultfilters import slugify

DOCUMENT_SENDER_MAP = {}


def move_sender_strings_to_sender_model(apps, schema_editor):
    sender_model = apps.get_model("documents", "Sender")
    document_model = apps.get_model("documents", "Document")

    # Create the sender and log the relationship with the document
    for document in document_model.objects.all():
        if document.sender:
            (
                DOCUMENT_SENDER_MAP[document.pk],
                _,
            ) = sender_model.objects.get_or_create(
                name=document.sender,
                defaults={"slug": slugify(document.sender)},
            )


def realign_senders(apps, schema_editor):
    document_model = apps.get_model("documents", "Document")
    for pk, sender in DOCUMENT_SENDER_MAP.items():
        document_model.objects.filter(pk=pk).update(sender=sender)


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0002_auto_20151226_1316"),
    ]

    operations = [
        migrations.CreateModel(
            name="Sender",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.CharField(max_length=128, unique=True)),
                ("slug", models.SlugField()),
            ],
        ),
        migrations.RunPython(move_sender_strings_to_sender_model),
        migrations.RemoveField(
            model_name="document",
            name="sender",
        ),
        migrations.AddField(
            model_name="document",
            name="sender",
            field=models.ForeignKey(
                blank=True,
                on_delete=django.db.models.deletion.CASCADE,
                to="documents.Sender",
            ),
        ),
        migrations.RunPython(realign_senders),
    ]
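

# The module-level DOCUMENT_SENDER_MAP is how this migration carries state
# between its two RunPython steps: the first records each document's future
# Sender while "sender" is still a CharField, the schema operations swap the
# field to a ForeignKey, and the second step reapplies the mapping. The same
# pattern, stripped down ("myapp"/"Thing" and the field names are placeholders):
_DEMO_STATE = {}


def _demo_capture(apps, schema_editor):  # pragma: no cover - illustrative only
    Thing = apps.get_model("myapp", "Thing")
    for obj in Thing.objects.all():
        _DEMO_STATE[obj.pk] = obj.old_field  # remember per-row data


def _demo_reapply(apps, schema_editor):  # pragma: no cover - illustrative only
    Thing = apps.get_model("myapp", "Thing")
    for pk, value in _DEMO_STATE.items():
        Thing.objects.filter(pk=pk).update(new_field=value)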

25
src/documents/migrations/0004_auto_20160114_1844.py
Normal file
@@ -0,0 +1,25 @@
# Generated by Django 1.9 on 2016-01-14 18:44

import django.db.models.deletion
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0003_sender"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="sender",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="documents",
                to="documents.Sender",
            ),
        ),
    ]

@@ -0,0 +1,178 @@
# Generated by Django 4.2.13 on 2024-06-28 17:52

import django.db.models.deletion
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    replaces = [
        ("documents", "0004_auto_20160114_1844"),
        ("documents", "0005_auto_20160123_0313"),
        ("documents", "0006_auto_20160123_0430"),
        ("documents", "0007_auto_20160126_2114"),
        ("documents", "0008_document_file_type"),
        ("documents", "0009_auto_20160214_0040"),
        ("documents", "0010_log"),
        ("documents", "0011_auto_20160303_1929"),
    ]

    dependencies = [
        ("documents", "0003_sender"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="sender",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="documents",
                to="documents.sender",
            ),
        ),
        migrations.AlterModelOptions(
            name="sender",
            options={"ordering": ("name",)},
        ),
        migrations.CreateModel(
            name="Tag",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.CharField(max_length=128, unique=True)),
                ("slug", models.SlugField(blank=True)),
                (
                    "colour",
                    models.PositiveIntegerField(
                        choices=[
                            (1, "#a6cee3"),
                            (2, "#1f78b4"),
                            (3, "#b2df8a"),
                            (4, "#33a02c"),
                            (5, "#fb9a99"),
                            (6, "#e31a1c"),
                            (7, "#fdbf6f"),
                            (8, "#ff7f00"),
                            (9, "#cab2d6"),
                            (10, "#6a3d9a"),
                            (11, "#b15928"),
                            (12, "#000000"),
                            (13, "#cccccc"),
                        ],
                        default=1,
                    ),
                ),
                ("match", models.CharField(blank=True, max_length=256)),
                (
                    "matching_algorithm",
                    models.PositiveIntegerField(
                        choices=[
                            (1, "Any"),
                            (2, "All"),
                            (3, "Literal"),
                            (4, "Regular Expression"),
                        ],
                        default=1,
                        help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
        ),
        migrations.AlterField(
            model_name="sender",
            name="slug",
            field=models.SlugField(blank=True),
        ),
        migrations.AddField(
            model_name="document",
            name="file_type",
            field=models.CharField(
                choices=[
                    ("pdf", "PDF"),
                    ("png", "PNG"),
                    ("jpg", "JPG"),
                    ("gif", "GIF"),
                    ("tiff", "TIFF"),
                ],
                default="pdf",
                editable=False,
                max_length=4,
            ),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name="document",
            name="tags",
            field=models.ManyToManyField(
                blank=True,
                related_name="documents",
                to="documents.tag",
            ),
        ),
        migrations.CreateModel(
            name="Log",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("group", models.UUIDField(blank=True)),
                ("message", models.TextField()),
                (
                    "level",
                    models.PositiveIntegerField(
                        choices=[
                            (10, "Debugging"),
                            (20, "Informational"),
                            (30, "Warning"),
                            (40, "Error"),
                            (50, "Critical"),
                        ],
                        default=20,
                    ),
                ),
                (
                    "component",
                    models.PositiveIntegerField(
                        choices=[(1, "Consumer"), (2, "Mail Fetcher")],
                    ),
                ),
                ("created", models.DateTimeField(auto_now_add=True)),
                ("modified", models.DateTimeField(auto_now=True)),
            ],
            options={
                "ordering": ("-modified",),
            },
        ),
        migrations.RenameModel(
            old_name="Sender",
            new_name="Correspondent",
        ),
        migrations.AlterModelOptions(
            name="document",
            options={"ordering": ("correspondent", "title")},
        ),
        migrations.RenameField(
            model_name="document",
            old_name="sender",
            new_name="correspondent",
        ),
    ]
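

# Editorial note: this file (like the later one replacing 0015-0018) is a
# squashed migration: "replaces" tells Django that, wherever none of the
# listed originals have run, this single migration stands in for all of
# them. A minimal example of the shape (app and migration names are
# placeholders; the class is deliberately not named "Migration" so Django's
# loader would ignore it here):
class _SquashedExample(migrations.Migration):  # pragma: no cover - illustrative
    replaces = [
        ("myapp", "0002_first_change"),
        ("myapp", "0003_second_change"),
    ]

    dependencies = [("myapp", "0001_initial")]

    operations = []  # the combined, optimized operations of 0002 and 0003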

16
src/documents/migrations/0005_auto_20160123_0313.py
Normal file
@@ -0,0 +1,16 @@
# Generated by Django 1.9 on 2016-01-23 03:13

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0004_auto_20160114_1844"),
    ]

    operations = [
        migrations.AlterModelOptions(
            name="sender",
            options={"ordering": ("name",)},
        ),
    ]

64
src/documents/migrations/0006_auto_20160123_0430.py
Normal file
@@ -0,0 +1,64 @@
# Generated by Django 1.9 on 2016-01-23 04:30

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0005_auto_20160123_0313"),
    ]

    operations = [
        migrations.CreateModel(
            name="Tag",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.CharField(max_length=128, unique=True)),
                ("slug", models.SlugField(blank=True)),
                (
                    "colour",
                    models.PositiveIntegerField(
                        choices=[
                            (1, "#a6cee3"),
                            (2, "#1f78b4"),
                            (3, "#b2df8a"),
                            (4, "#33a02c"),
                            (5, "#fb9a99"),
                            (6, "#e31a1c"),
                            (7, "#fdbf6f"),
                            (8, "#ff7f00"),
                            (9, "#cab2d6"),
                            (10, "#6a3d9a"),
                            (11, "#ffff99"),
                            (12, "#b15928"),
                            (13, "#000000"),
                            (14, "#cccccc"),
                        ],
                        default=1,
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
        ),
        migrations.AlterField(
            model_name="sender",
            name="slug",
            field=models.SlugField(blank=True),
        ),
        migrations.AddField(
            model_name="document",
            name="tags",
            field=models.ManyToManyField(related_name="documents", to="documents.Tag"),
        ),
    ]

55
src/documents/migrations/0007_auto_20160126_2114.py
Normal file
@@ -0,0 +1,55 @@
# Generated by Django 1.9 on 2016-01-26 21:14

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0006_auto_20160123_0430"),
    ]

    operations = [
        migrations.AddField(
            model_name="tag",
            name="match",
            field=models.CharField(blank=True, max_length=256),
        ),
        migrations.AddField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                blank=True,
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                ],
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="colour",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "#a6cee3"),
                    (2, "#1f78b4"),
                    (3, "#b2df8a"),
                    (4, "#33a02c"),
                    (5, "#fb9a99"),
                    (6, "#e31a1c"),
                    (7, "#fdbf6f"),
                    (8, "#ff7f00"),
                    (9, "#cab2d6"),
                    (10, "#6a3d9a"),
                    (11, "#b15928"),
                    (12, "#000000"),
                    (13, "#cccccc"),
                ],
                default=1,
            ),
        ),
    ]

39
src/documents/migrations/0008_document_file_type.py
Normal file
@@ -0,0 +1,39 @@
# Generated by Django 1.9 on 2016-01-29 22:58

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0007_auto_20160126_2114"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="file_type",
            field=models.CharField(
                choices=[
                    ("pdf", "PDF"),
                    ("png", "PNG"),
                    ("jpg", "JPG"),
                    ("gif", "GIF"),
                    ("tiff", "TIFF"),
                ],
                default="pdf",
                editable=False,
                max_length=4,
            ),
            preserve_default=False,
        ),
        migrations.AlterField(
            model_name="document",
            name="tags",
            field=models.ManyToManyField(
                blank=True,
                related_name="documents",
                to="documents.Tag",
            ),
        ),
    ]

27
src/documents/migrations/0009_auto_20160214_0040.py
Normal file
@@ -0,0 +1,27 @@
# Generated by Django 1.9 on 2016-02-14 00:40

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0008_document_file_type"),
    ]

    operations = [
        migrations.AlterField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                ],
                default=1,
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
            ),
        ),
    ]

53
src/documents/migrations/0010_log.py
Normal file
@@ -0,0 +1,53 @@
# Generated by Django 1.9 on 2016-02-27 17:54

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0009_auto_20160214_0040"),
    ]

    operations = [
        migrations.CreateModel(
            name="Log",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("group", models.UUIDField(blank=True)),
                ("message", models.TextField()),
                (
                    "level",
                    models.PositiveIntegerField(
                        choices=[
                            (10, "Debugging"),
                            (20, "Informational"),
                            (30, "Warning"),
                            (40, "Error"),
                            (50, "Critical"),
                        ],
                        default=20,
                    ),
                ),
                (
                    "component",
                    models.PositiveIntegerField(
                        choices=[(1, "Consumer"), (2, "Mail Fetcher")],
                    ),
                ),
                ("created", models.DateTimeField(auto_now_add=True)),
                ("modified", models.DateTimeField(auto_now=True)),
            ],
            options={
                "ordering": ("-modified",),
            },
        ),
    ]

26
src/documents/migrations/0011_auto_20160303_1929.py
Normal file
@@ -0,0 +1,26 @@
# Generated by Django 1.9.2 on 2016-03-03 19:29

from django.db import migrations


class Migration(migrations.Migration):
    atomic = False
    dependencies = [
        ("documents", "0010_log"),
    ]

    operations = [
        migrations.RenameModel(
            old_name="Sender",
            new_name="Correspondent",
        ),
        migrations.AlterModelOptions(
            name="document",
            options={"ordering": ("correspondent", "title")},
        ),
        migrations.RenameField(
            model_name="document",
            old_name="sender",
            new_name="correspondent",
        ),
    ]

128
src/documents/migrations/0012_auto_20160305_0040.py
Normal file
@@ -0,0 +1,128 @@
# Generated by Django 1.9.2 on 2016-03-05 00:40

import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path

import gnupg
from django.conf import settings
from django.db import migrations
from django.utils.termcolors import colorize as colourise  # Spelling hurts me


class GnuPG:
    """
    A handy singleton to use when handling encrypted files.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, file_handle):
        return cls.gpg.decrypt_file(file_handle, passphrase=settings.PASSPHRASE).data

    @classmethod
    def encrypted(cls, file_handle):
        return cls.gpg.encrypt_file(
            file_handle,
            recipients=None,
            passphrase=settings.PASSPHRASE,
            symmetric=True,
        ).data


def move_documents_and_create_thumbnails(apps, schema_editor):
    (Path(settings.MEDIA_ROOT) / "documents" / "originals").mkdir(
        parents=True,
        exist_ok=True,
    )
    (Path(settings.MEDIA_ROOT) / "documents" / "thumbnails").mkdir(
        parents=True,
        exist_ok=True,
    )

    documents: list[str] = os.listdir(Path(settings.MEDIA_ROOT) / "documents")  # noqa: PTH208

    if set(documents) == {"originals", "thumbnails"}:
        return

    print(
        colourise(
            "\n\n"
            " This is a one-time only migration to generate thumbnails for all of your\n"
            " documents so that future UIs will have something to work with. If you have\n"
            " a lot of documents though, this may take a while, so a coffee break may be\n"
            " in order."
            "\n",
            opts=("bold",),
        ),
    )

    Path(settings.SCRATCH_DIR).mkdir(parents=True, exist_ok=True)

    for f in sorted(documents):
        if not f.endswith("gpg"):
            continue

        print(
            " {} {} {}".format(
                colourise("*", fg="green"),
                colourise("Generating a thumbnail for", fg="white"),
                colourise(f, fg="cyan"),
            ),
        )

        thumb_temp: str = tempfile.mkdtemp(prefix="paperless", dir=settings.SCRATCH_DIR)
        orig_temp: str = tempfile.mkdtemp(prefix="paperless", dir=settings.SCRATCH_DIR)

        orig_source: Path = Path(settings.MEDIA_ROOT) / "documents" / f
        orig_target: Path = Path(orig_temp) / f.replace(".gpg", "")

        with orig_source.open("rb") as encrypted, orig_target.open("wb") as unencrypted:
            unencrypted.write(GnuPG.decrypted(encrypted))

        subprocess.Popen(
            (
                settings.CONVERT_BINARY,
                "-scale",
                "500x5000",
                "-alpha",
                "remove",
                orig_target,
                Path(thumb_temp) / "convert-%04d.png",
            ),
        ).wait()

        thumb_source: Path = Path(thumb_temp) / "convert-0000.png"
        thumb_target: Path = (
            Path(settings.MEDIA_ROOT)
            / "documents"
            / "thumbnails"
            / re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
        )
        with (
            thumb_source.open("rb") as unencrypted,
            thumb_target.open("wb") as encrypted,
        ):
            encrypted.write(GnuPG.encrypted(unencrypted))

        shutil.rmtree(thumb_temp)
        shutil.rmtree(orig_temp)

        shutil.move(
            Path(settings.MEDIA_ROOT) / "documents" / f,
            Path(settings.MEDIA_ROOT) / "documents" / "originals" / f,
        )


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0011_auto_20160303_1929"),
    ]

    operations = [
        migrations.RunPython(move_documents_and_create_thumbnails),
    ]
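

# The GnuPG helper above uses python-gnupg's symmetric mode: no recipient
# keys, just a passphrase. A round trip looks like this (the home directory,
# file name and passphrase are made up for the sketch):
def _demo_gpg_roundtrip():  # pragma: no cover - illustrative only
    gpg = gnupg.GPG(gnupghome="/tmp/gnupg-home")
    with open("document.pdf", "rb") as fh:
        ciphertext = gpg.encrypt_file(
            fh,
            recipients=None,      # symmetric: no public keys involved
            passphrase="secret",  # placeholder passphrase
            symmetric=True,
        ).data
    # Decrypting later needs only the same passphrase:
    #   gpg.decrypt_file(handle, passphrase="secret").data
    return ciphertext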

42
src/documents/migrations/0013_auto_20160325_2111.py
Normal file
@@ -0,0 +1,42 @@
# Generated by Django 1.9.4 on 2016-03-25 21:11

import django.utils.timezone
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0012_auto_20160305_0040"),
    ]

    operations = [
        migrations.AddField(
            model_name="correspondent",
            name="match",
            field=models.CharField(blank=True, max_length=256),
        ),
        migrations.AddField(
            model_name="correspondent",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                ],
                default=1,
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="created",
            field=models.DateTimeField(default=django.utils.timezone.now),
        ),
        migrations.RemoveField(
            model_name="log",
            name="component",
        ),
    ]

182
src/documents/migrations/0014_document_checksum.py
Normal file
@@ -0,0 +1,182 @@
# Generated by Django 1.9.4 on 2016-03-28 19:09

import hashlib
from pathlib import Path

import django.utils.timezone
import gnupg
from django.conf import settings
from django.db import migrations
from django.db import models
from django.template.defaultfilters import slugify
from django.utils.termcolors import colorize as colourise  # Spelling hurts me


class GnuPG:
    """
    A handy singleton to use when handling encrypted files.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, file_handle):
        return cls.gpg.decrypt_file(file_handle, passphrase=settings.PASSPHRASE).data

    @classmethod
    def encrypted(cls, file_handle):
        return cls.gpg.encrypt_file(
            file_handle,
            recipients=None,
            passphrase=settings.PASSPHRASE,
            symmetric=True,
        ).data


class Document:
    """
    Django's migrations restrict access to model methods, so this is a snapshot
    of the methods that existed at the time this migration was written, since
    we need to make use of a lot of these shortcuts here.
    """

    def __init__(self, doc):
        self.pk = doc.pk
        self.correspondent = doc.correspondent
        self.title = doc.title
        self.file_type = doc.file_type
        self.tags = doc.tags
        self.created = doc.created

    def __str__(self):
        created = self.created.strftime("%Y%m%d%H%M%S")
        if self.correspondent and self.title:
            return f"{created}: {self.correspondent} - {self.title}"
        if self.correspondent or self.title:
            return f"{created}: {self.correspondent or self.title}"
        return str(created)

    @property
    def source_path(self):
        return (
            Path(settings.MEDIA_ROOT)
            / "documents"
            / "originals"
            / f"{self.pk:07}.{self.file_type}.gpg"
        )

    @property
    def source_file(self):
        return self.source_path.open("rb")

    @property
    def file_name(self):
        return slugify(str(self)) + "." + self.file_type


def set_checksums(apps, schema_editor):
    document_model = apps.get_model("documents", "Document")

    if not document_model.objects.all().exists():
        return

    print(
        colourise(
            "\n\n"
            " This is a one-time only migration to generate checksums for all\n"
            " of your existing documents. If you have a lot of documents\n"
            " though, this may take a while, so a coffee break may be in\n"
            " order."
            "\n",
            opts=("bold",),
        ),
    )

    sums = {}
    for d in document_model.objects.all():
        document = Document(d)

        print(
            " {} {} {}".format(
                colourise("*", fg="green"),
                colourise("Generating a checksum for", fg="white"),
                colourise(document.file_name, fg="cyan"),
            ),
        )

        with document.source_file as encrypted:
            checksum = hashlib.md5(GnuPG.decrypted(encrypted)).hexdigest()

        if checksum in sums:
            error = "\n{line}{p1}\n\n{doc1}\n{doc2}\n\n{p2}\n\n{code}\n\n{p3}{line}".format(
                p1=colourise(
                    "It appears that you have two identical documents in your collection and \nPaperless no longer supports this (see issue #97). The documents in question\nare:",
                    fg="yellow",
                ),
                p2=colourise(
                    "To fix this problem, you'll have to remove one of them from the database, a task\nmost easily done by running the following command in the same\ndirectory as manage.py:",
                    fg="yellow",
                ),
                p3=colourise(
                    "When that's finished, re-run the migrate, and provided that there aren't any\nother duplicates, you should be good to go.",
                    fg="yellow",
                ),
                doc1=colourise(
                    f" * {sums[checksum][1]} (id: {sums[checksum][0]})",
                    fg="red",
                ),
                doc2=colourise(
                    f" * {document.file_name} (id: {document.pk})",
                    fg="red",
                ),
                code=colourise(
                    f" $ echo 'DELETE FROM documents_document WHERE id = {document.pk};' | ./manage.py dbshell",
                    fg="green",
                ),
                line=colourise("\n{}\n".format("=" * 80), fg="white", opts=("bold",)),
            )
            raise RuntimeError(error)
        sums[checksum] = (document.pk, document.file_name)

        document_model.objects.filter(pk=document.pk).update(checksum=checksum)


def do_nothing(apps, schema_editor):
    pass


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0013_auto_20160325_2111"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                default="-",
                db_index=True,
                editable=False,
                max_length=32,
                help_text="The checksum of the original document (before it "
                "was encrypted). We use this to prevent duplicate "
                "document imports.",
            ),
            preserve_default=False,
        ),
        migrations.RunPython(set_checksums, do_nothing),
        migrations.AlterField(
            model_name="document",
            name="created",
            field=models.DateTimeField(
                db_index=True,
                default=django.utils.timezone.now,
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="modified",
            field=models.DateTimeField(auto_now=True, db_index=True),
        ),
    ]
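

# The dedup logic in set_checksums, minus the GPG and console dressing: an
# MD5 digest per decrypted blob, collected in a dict so the second
# occurrence of any digest aborts the run.
def _demo_find_duplicates(blobs):  # pragma: no cover - illustrative only
    """blobs maps pk -> raw bytes; raises on the first duplicate pair."""
    sums = {}
    for pk, data in blobs.items():
        digest = hashlib.md5(data).hexdigest()
        if digest in sums:
            raise RuntimeError(f"documents {sums[digest]} and {pk} are identical")
        sums[digest] = pk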

33
src/documents/migrations/0015_add_insensitive_to_match.py
Normal file
@@ -0,0 +1,33 @@
# Generated by Django 1.10.2 on 2016-10-05 21:38

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0014_document_checksum"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                editable=False,
                help_text="The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.",
                max_length=32,
                unique=True,
            ),
        ),
        migrations.AddField(
            model_name="correspondent",
            name="is_insensitive",
            field=models.BooleanField(default=True),
        ),
        migrations.AddField(
            model_name="tag",
            name="is_insensitive",
            field=models.BooleanField(default=True),
        ),
    ]

@@ -0,0 +1,92 @@
# Generated by Django 4.2.13 on 2024-06-28 17:57

import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    replaces = [
        ("documents", "0015_add_insensitive_to_match"),
        ("documents", "0016_auto_20170325_1558"),
        ("documents", "0017_auto_20170512_0507"),
        ("documents", "0018_auto_20170715_1712"),
    ]

    dependencies = [
        ("documents", "0014_document_checksum"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                editable=False,
                help_text="The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.",
                max_length=32,
                unique=True,
            ),
        ),
        migrations.AddField(
            model_name="correspondent",
            name="is_insensitive",
            field=models.BooleanField(default=True),
        ),
        migrations.AddField(
            model_name="tag",
            name="is_insensitive",
            field=models.BooleanField(default=True),
        ),
        migrations.AlterField(
            model_name="document",
            name="content",
            field=models.TextField(
                blank=True,
                db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]),
                help_text="The raw, text-only data of the document. This field is primarily used for searching.",
            ),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                    (5, "Fuzzy Match"),
                ],
                default=1,
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                    (5, "Fuzzy Match"),
                ],
                default=1,
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="correspondent",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.correspondent",
            ),
        ),
    ]

23
src/documents/migrations/0016_auto_20170325_1558.py
Normal file
@@ -0,0 +1,23 @@
# Generated by Django 1.10.5 on 2017-03-25 15:58

from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0015_add_insensitive_to_match"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="content",
            field=models.TextField(
                blank=True,
                db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]),
                help_text="The raw, text-only data of the document. This field is primarily used for searching.",
            ),
        ),
    ]

43
src/documents/migrations/0017_auto_20170512_0507.py
Normal file
@@ -0,0 +1,43 @@
# Generated by Django 1.10.5 on 2017-05-12 05:07

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0016_auto_20170325_1558"),
    ]

    operations = [
        migrations.AlterField(
            model_name="correspondent",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                    (5, "Fuzzy Match"),
                ],
                default=1,
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                    (5, "Fuzzy Match"),
                ],
                default=1,
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
            ),
        ),
    ]

25
src/documents/migrations/0018_auto_20170715_1712.py
Normal file
@@ -0,0 +1,25 @@
# Generated by Django 1.10.5 on 2017-07-15 17:12

import django.db.models.deletion
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0017_auto_20170512_0507"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="correspondent",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.Correspondent",
            ),
        ),
    ]

22
src/documents/migrations/0019_add_consumer_user.py
Normal file
@@ -0,0 +1,22 @@
# Generated by Django 1.10.5 on 2017-07-15 17:12

from django.contrib.auth.models import User
from django.db import migrations


def forwards_func(apps, schema_editor):
    User.objects.create(username="consumer")


def reverse_func(apps, schema_editor):
    User.objects.get(username="consumer").delete()


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0018_auto_20170715_1712"),
    ]

    operations = [
        migrations.RunPython(forwards_func, reverse_func),
    ]

29
src/documents/migrations/0020_document_added.py
Normal file
@@ -0,0 +1,29 @@
import django.utils.timezone
from django.db import migrations
from django.db import models


def set_added_time_to_created_time(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    for doc in Document.objects.all():
        doc.added = doc.created
        doc.save()


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0019_add_consumer_user"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="added",
            field=models.DateTimeField(
                db_index=True,
                default=django.utils.timezone.now,
                editable=False,
            ),
        ),
        migrations.RunPython(set_added_time_to_created_time),
    ]

41
src/documents/migrations/0021_document_storage_type.py
Normal file
@@ -0,0 +1,41 @@
# Generated by Django 1.11.10 on 2018-02-04 13:07

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0020_document_added"),
    ]

    operations = [
        # Add the field with the default GPG-encrypted value
        migrations.AddField(
            model_name="document",
            name="storage_type",
            field=models.CharField(
                choices=[
                    ("unencrypted", "Unencrypted"),
                    ("gpg", "Encrypted with GNU Privacy Guard"),
                ],
                default="gpg",
                editable=False,
                max_length=11,
            ),
        ),
        # Now that the field is added, change the default to unencrypted
        migrations.AlterField(
            model_name="document",
            name="storage_type",
            field=models.CharField(
                choices=[
                    ("unencrypted", "Unencrypted"),
                    ("gpg", "Encrypted with GNU Privacy Guard"),
                ],
                default="unencrypted",
                editable=False,
                max_length=11,
            ),
        ),
    ]
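

# The two operations above are a deliberate backfill-then-flip: AddField
# with default="gpg" stamps every existing row (all of which were GPG
# encrypted at this point), and the immediate AlterField changes the default
# to "unencrypted" for rows created afterwards. The same shape in miniature
# ("thing"/"state"/"legacy"/"modern" are placeholder names):
_demo_backfill_then_flip = [
    migrations.AddField(        # existing rows are backfilled with "legacy"
        model_name="thing",
        name="state",
        field=models.CharField(max_length=10, default="legacy"),
    ),
    migrations.AlterField(      # rows created from now on default to "modern"
        model_name="thing",
        name="state",
        field=models.CharField(max_length=10, default="modern"),
    ),
]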

61
src/documents/migrations/0022_auto_20181007_1420.py
Normal file
@@ -0,0 +1,61 @@
# Generated by Django 2.0.8 on 2018-10-07 14:20

from django.db import migrations
from django.db import models
from django.utils.text import slugify


def re_slug_all_the_things(apps, schema_editor):
    """
    Rewrite all slug values to make sure they're actually slugs before we brand
    them as uneditable.
    """

    Tag = apps.get_model("documents", "Tag")
    Correspondent = apps.get_model("documents", "Correspondent")

    for klass in (Tag, Correspondent):
        for instance in klass.objects.all():
            klass.objects.filter(pk=instance.pk).update(slug=slugify(instance.slug))


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0021_document_storage_type"),
    ]

    operations = [
        migrations.AlterModelOptions(
            name="tag",
            options={"ordering": ("name",)},
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="slug",
            field=models.SlugField(blank=True, editable=False),
        ),
        migrations.AlterField(
            model_name="document",
            name="file_type",
            field=models.CharField(
                choices=[
                    ("pdf", "PDF"),
                    ("png", "PNG"),
                    ("jpg", "JPG"),
                    ("gif", "GIF"),
                    ("tiff", "TIFF"),
                    ("txt", "TXT"),
                    ("csv", "CSV"),
                    ("md", "MD"),
                ],
                editable=False,
                max_length=4,
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="slug",
            field=models.SlugField(blank=True, editable=False),
        ),
        migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop),
    ]
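

# Two details worth flagging here: slugify is idempotent, so blindly
# re-slugging every row is safe, and migrations.RunPython.noop as the
# reverse callable keeps the migration reversible even though "un-slugging"
# is meaningless. A quick check of the idempotence claim:
def _demo_slugify_idempotent():  # pragma: no cover - illustrative only
    assert slugify("Fernseher & Zubehör!") == "fernseher-zubehor"
    assert slugify("fernseher-zubehor") == "fernseher-zubehor"  # already clean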

39
src/documents/migrations/0023_document_current_filename.py
Normal file
@@ -0,0 +1,39 @@
# Generated by Django 2.0.10 on 2019-04-26 18:57

from django.db import migrations
from django.db import models


def set_filename(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    for doc in Document.objects.all():
        file_name = f"{doc.pk:07}.{doc.file_type}"
        if doc.storage_type == "gpg":
            file_name += ".gpg"

        # Set filename
        doc.filename = file_name

        # Save document
        doc.save()


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0022_auto_20181007_1420"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="filename",
            field=models.FilePathField(
                default=None,
                null=True,
                editable=False,
                help_text="Current filename in storage",
                max_length=256,
            ),
        ),
        migrations.RunPython(set_filename),
    ]
|
||||
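A quick illustration of the naming rule `set_filename` materializes: before this migration, originals on disk were addressed purely by zero-padded primary key plus extension, with `.gpg` appended for encrypted storage. A minimal sketch of that convention outside Django (the sample values are made up):

from __future__ import annotations


def legacy_filename(pk: int, file_type: str, encrypted: bool = False) -> str:
    # Seven-digit zero-padded primary key, then the stored extension,
    # then ".gpg" for GnuPG-encrypted originals.
    name = f"{pk:07}.{file_type}"
    return f"{name}.gpg" if encrypted else name


assert legacy_filename(42, "pdf") == "0000042.pdf"
assert legacy_filename(42, "pdf", encrypted=True) == "0000042.pdf.gpg"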
147 src/documents/migrations/1000_update_paperless_all.py Normal file
@@ -0,0 +1,147 @@
# Generated by Django 3.1.3 on 2020-11-07 12:35
import uuid

import django.db.models.deletion
from django.db import migrations
from django.db import models


def logs_set_default_group(apps, schema_editor):
    Log = apps.get_model("documents", "Log")
    for log in Log.objects.all():
        if log.group is None:
            log.group = uuid.uuid4()
            log.save()


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0023_document_current_filename"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="archive_serial_number",
            field=models.IntegerField(
                blank=True,
                db_index=True,
                help_text="The position of this document in your physical document archive.",
                null=True,
                unique=True,
            ),
        ),
        migrations.AddField(
            model_name="tag",
            name="is_inbox_tag",
            field=models.BooleanField(
                default=False,
                help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.",
            ),
        ),
        migrations.CreateModel(
            name="DocumentType",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.CharField(max_length=128, unique=True)),
                ("slug", models.SlugField(blank=True, editable=False)),
                ("match", models.CharField(blank=True, max_length=256)),
                (
                    "matching_algorithm",
                    models.PositiveIntegerField(
                        choices=[
                            (1, "Any"),
                            (2, "All"),
                            (3, "Literal"),
                            (4, "Regular Expression"),
                            (5, "Fuzzy Match"),
                            (6, "Automatic Classification"),
                        ],
                        default=1,
                        help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
                    ),
                ),
                ("is_insensitive", models.BooleanField(default=True)),
            ],
            options={
                "abstract": False,
                "ordering": ("name",),
            },
        ),
        migrations.AddField(
            model_name="document",
            name="document_type",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.documenttype",
            ),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                    (5, "Fuzzy Match"),
                    (6, "Automatic Classification"),
                ],
                default=1,
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any"),
                    (2, "All"),
                    (3, "Literal"),
                    (4, "Regular Expression"),
                    (5, "Fuzzy Match"),
                    (6, "Automatic Classification"),
                ],
                default=1,
                help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containing imperfections that foil accurate OCR.',
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="content",
            field=models.TextField(
                blank=True,
                help_text="The raw, text-only data of the document. This field is primarily used for searching.",
            ),
        ),
        migrations.AlterModelOptions(
            name="log",
            options={"ordering": ("-created",)},
        ),
        migrations.RemoveField(
            model_name="log",
            name="modified",
        ),
        migrations.AlterField(
            model_name="log",
            name="group",
            field=models.UUIDField(blank=True, null=True),
        ),
        migrations.RunPython(
            code=django.db.migrations.operations.special.RunPython.noop,
            reverse_code=logs_set_default_group,
        ),
    ]
13 src/documents/migrations/1001_auto_20201109_1636.py Normal file
@@ -0,0 +1,13 @@
# Generated by Django 3.1.3 on 2020-11-09 16:36

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1000_update_paperless_all"),
    ]

    operations = [
        migrations.RunPython(migrations.RunPython.noop, migrations.RunPython.noop),
    ]
24 src/documents/migrations/1002_auto_20201111_1105.py Normal file
@@ -0,0 +1,24 @@
# Generated by Django 3.1.3 on 2020-11-11 11:05

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1001_auto_20201109_1636"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="filename",
            field=models.FilePathField(
                default=None,
                editable=False,
                help_text="Current filename in storage",
                max_length=1024,
                null=True,
            ),
        ),
    ]
92 src/documents/migrations/1003_mime_types.py Normal file
@@ -0,0 +1,92 @@
# Generated by Django 3.1.3 on 2020-11-20 11:21
from pathlib import Path

import magic
from django.conf import settings
from django.db import migrations
from django.db import models

from paperless.db import GnuPG

STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"


def source_path(self) -> Path:
    if self.filename:
        fname: str = str(self.filename)
    else:
        fname = f"{self.pk:07}.{self.file_type}"
        if self.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"

    return Path(settings.ORIGINALS_DIR) / fname


def add_mime_types(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    documents = Document.objects.all()

    for d in documents:
        with Path(source_path(d)).open("rb") as f:
            if d.storage_type == STORAGE_TYPE_GPG:
                data = GnuPG.decrypted(f)
            else:
                data = f.read(1024)

        d.mime_type = magic.from_buffer(data, mime=True)
        d.save()


def add_file_extensions(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    documents = Document.objects.all()

    for d in documents:
        d.file_type = Path(d.filename).suffix.lstrip(".")
        d.save()


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1002_auto_20201111_1105"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="mime_type",
            field=models.CharField(default="-", editable=False, max_length=256),
            preserve_default=False,
        ),
        migrations.RunPython(add_mime_types, migrations.RunPython.noop),
        # This operation is here so that we can revert the entire migration:
        # By allowing this field to be blank and null, we can revert the
        # remove operation further down and the database won't complain about
        # NOT NULL violations.
        migrations.AlterField(
            model_name="document",
            name="file_type",
            field=models.CharField(
                choices=[
                    ("pdf", "PDF"),
                    ("png", "PNG"),
                    ("jpg", "JPG"),
                    ("gif", "GIF"),
                    ("tiff", "TIFF"),
                    ("txt", "TXT"),
                    ("csv", "CSV"),
                    ("md", "MD"),
                ],
                editable=False,
                max_length=4,
                null=True,
                blank=True,
            ),
        ),
        migrations.RunPython(migrations.RunPython.noop, add_file_extensions),
        migrations.RemoveField(
            model_name="document",
            name="file_type",
        ),
    ]
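Note that `add_mime_types` only ever hands libmagic the first 1024 bytes of each unencrypted file, which is enough for content sniffing. A standalone sketch of the same detection call, assuming the python-magic bindings this migration already imports (the path is illustrative):

from pathlib import Path

import magic


def sniff_mime_type(path: Path) -> str:
    # libmagic identifies a file from its leading bytes; one kilobyte
    # comfortably covers the formats involved here (PDF, images, text).
    with path.open("rb") as f:
        return magic.from_buffer(f.read(1024), mime=True)


print(sniff_mime_type(Path("/tmp/example.pdf")))  # e.g. "application/pdf"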
12 src/documents/migrations/1004_sanity_check_schedule.py Normal file
@@ -0,0 +1,12 @@
# Generated by Django 3.1.3 on 2020-11-25 14:53

from django.db import migrations
from django.db.migrations import RunPython


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1003_mime_types"),
    ]

    operations = [RunPython(migrations.RunPython.noop, migrations.RunPython.noop)]
34 src/documents/migrations/1005_checksums.py Normal file
@@ -0,0 +1,34 @@
# Generated by Django 3.1.3 on 2020-11-29 00:48

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1004_sanity_check_schedule"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="archive_checksum",
            field=models.CharField(
                blank=True,
                editable=False,
                help_text="The checksum of the archived document.",
                max_length=32,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                editable=False,
                help_text="The checksum of the original document.",
                max_length=32,
                unique=True,
            ),
        ),
    ]
24 src/documents/migrations/1006_auto_20201208_2209.py Normal file
@@ -0,0 +1,24 @@
# Generated by Django 3.1.4 on 2020-12-08 22:09

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1005_checksums"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="correspondent",
            name="slug",
        ),
        migrations.RemoveField(
            model_name="documenttype",
            name="slug",
        ),
        migrations.RemoveField(
            model_name="tag",
            name="slug",
        ),
    ]
@@ -0,0 +1,485 @@
# Generated by Django 4.2.13 on 2024-06-28 18:01

import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    replaces = [
        ("documents", "1006_auto_20201208_2209"),
        ("documents", "1007_savedview_savedviewfilterrule"),
        ("documents", "1008_auto_20201216_1736"),
        ("documents", "1009_auto_20201216_2005"),
        ("documents", "1010_auto_20210101_2159"),
        ("documents", "1011_auto_20210101_2340"),
    ]

    dependencies = [
        ("documents", "1005_checksums"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.RemoveField(
            model_name="correspondent",
            name="slug",
        ),
        migrations.RemoveField(
            model_name="documenttype",
            name="slug",
        ),
        migrations.RemoveField(
            model_name="tag",
            name="slug",
        ),
        migrations.CreateModel(
            name="SavedView",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.CharField(max_length=128, verbose_name="name")),
                (
                    "show_on_dashboard",
                    models.BooleanField(verbose_name="show on dashboard"),
                ),
                (
                    "show_in_sidebar",
                    models.BooleanField(verbose_name="show in sidebar"),
                ),
                (
                    "sort_field",
                    models.CharField(max_length=128, verbose_name="sort field"),
                ),
                (
                    "sort_reverse",
                    models.BooleanField(default=False, verbose_name="sort reverse"),
                ),
                (
                    "user",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to=settings.AUTH_USER_MODEL,
                        verbose_name="user",
                    ),
                ),
            ],
            options={
                "ordering": ("name",),
                "verbose_name": "saved view",
                "verbose_name_plural": "saved views",
            },
        ),
        migrations.AlterModelOptions(
            name="correspondent",
            options={
                "ordering": ("name",),
                "verbose_name": "correspondent",
                "verbose_name_plural": "correspondents",
            },
        ),
        migrations.AlterModelOptions(
            name="document",
            options={
                "ordering": ("-created",),
                "verbose_name": "document",
                "verbose_name_plural": "documents",
            },
        ),
        migrations.AlterModelOptions(
            name="documenttype",
            options={
                "verbose_name": "document type",
                "verbose_name_plural": "document types",
            },
        ),
        migrations.AlterModelOptions(
            name="log",
            options={
                "ordering": ("-created",),
                "verbose_name": "log",
                "verbose_name_plural": "logs",
            },
        ),
        migrations.AlterModelOptions(
            name="tag",
            options={"verbose_name": "tag", "verbose_name_plural": "tags"},
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="is_insensitive",
            field=models.BooleanField(default=True, verbose_name="is insensitive"),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="match",
            field=models.CharField(blank=True, max_length=256, verbose_name="match"),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="name",
            field=models.CharField(max_length=128, unique=True, verbose_name="name"),
        ),
        migrations.AlterField(
            model_name="document",
            name="added",
            field=models.DateTimeField(
                db_index=True,
                default=django.utils.timezone.now,
                editable=False,
                verbose_name="added",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="archive_checksum",
            field=models.CharField(
                blank=True,
                editable=False,
                help_text="The checksum of the archived document.",
                max_length=32,
                null=True,
                verbose_name="archive checksum",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="archive_serial_number",
            field=models.IntegerField(
                blank=True,
                db_index=True,
                help_text="The position of this document in your physical document archive.",
                null=True,
                unique=True,
                verbose_name="archive serial number",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                editable=False,
                help_text="The checksum of the original document.",
                max_length=32,
                unique=True,
                verbose_name="checksum",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="content",
            field=models.TextField(
                blank=True,
                help_text="The raw, text-only data of the document. This field is primarily used for searching.",
                verbose_name="content",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="correspondent",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.correspondent",
                verbose_name="correspondent",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="created",
            field=models.DateTimeField(
                db_index=True,
                default=django.utils.timezone.now,
                verbose_name="created",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="document_type",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.documenttype",
                verbose_name="document type",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="filename",
            field=models.FilePathField(
                default=None,
                editable=False,
                help_text="Current filename in storage",
                max_length=1024,
                null=True,
                verbose_name="filename",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="mime_type",
            field=models.CharField(
                editable=False,
                max_length=256,
                verbose_name="mime type",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="modified",
            field=models.DateTimeField(
                auto_now=True,
                db_index=True,
                verbose_name="modified",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="storage_type",
            field=models.CharField(
                choices=[
                    ("unencrypted", "Unencrypted"),
                    ("gpg", "Encrypted with GNU Privacy Guard"),
                ],
                default="unencrypted",
                editable=False,
                max_length=11,
                verbose_name="storage type",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="tags",
            field=models.ManyToManyField(
                blank=True,
                related_name="documents",
                to="documents.tag",
                verbose_name="tags",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="title",
            field=models.CharField(
                blank=True,
                db_index=True,
                max_length=128,
                verbose_name="title",
            ),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="is_insensitive",
            field=models.BooleanField(default=True, verbose_name="is insensitive"),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="match",
            field=models.CharField(blank=True, max_length=256, verbose_name="match"),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="name",
            field=models.CharField(max_length=128, unique=True, verbose_name="name"),
        ),
        migrations.AlterField(
            model_name="log",
            name="created",
            field=models.DateTimeField(auto_now_add=True, verbose_name="created"),
        ),
        migrations.AlterField(
            model_name="log",
            name="group",
            field=models.UUIDField(blank=True, null=True, verbose_name="group"),
        ),
        migrations.AlterField(
            model_name="log",
            name="level",
            field=models.PositiveIntegerField(
                choices=[
                    (10, "debug"),
                    (20, "information"),
                    (30, "warning"),
                    (40, "error"),
                    (50, "critical"),
                ],
                default=20,
                verbose_name="level",
            ),
        ),
        migrations.AlterField(
            model_name="log",
            name="message",
            field=models.TextField(verbose_name="message"),
        ),
        migrations.CreateModel(
            name="SavedViewFilterRule",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "rule_type",
                    models.PositiveIntegerField(
                        choices=[
                            (0, "title contains"),
                            (1, "content contains"),
                            (2, "ASN is"),
                            (3, "correspondent is"),
                            (4, "document type is"),
                            (5, "is in inbox"),
                            (6, "has tag"),
                            (7, "has any tag"),
                            (8, "created before"),
                            (9, "created after"),
                            (10, "created year is"),
                            (11, "created month is"),
                            (12, "created day is"),
                            (13, "added before"),
                            (14, "added after"),
                            (15, "modified before"),
                            (16, "modified after"),
                            (17, "does not have tag"),
                        ],
                        verbose_name="rule type",
                    ),
                ),
                (
                    "value",
                    models.CharField(
                        blank=True,
                        max_length=128,
                        null=True,
                        verbose_name="value",
                    ),
                ),
                (
                    "saved_view",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="filter_rules",
                        to="documents.savedview",
                        verbose_name="saved view",
                    ),
                ),
            ],
            options={
                "verbose_name": "filter rule",
                "verbose_name_plural": "filter rules",
            },
        ),
        migrations.AlterField(
            model_name="tag",
            name="colour",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "#a6cee3"),
                    (2, "#1f78b4"),
                    (3, "#b2df8a"),
                    (4, "#33a02c"),
                    (5, "#fb9a99"),
                    (6, "#e31a1c"),
                    (7, "#fdbf6f"),
                    (8, "#ff7f00"),
                    (9, "#cab2d6"),
                    (10, "#6a3d9a"),
                    (11, "#b15928"),
                    (12, "#000000"),
                    (13, "#cccccc"),
                ],
                default=1,
                verbose_name="color",
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="is_inbox_tag",
            field=models.BooleanField(
                default=False,
                help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.",
                verbose_name="is inbox tag",
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="is_insensitive",
            field=models.BooleanField(default=True, verbose_name="is insensitive"),
        ),
        migrations.AlterField(
            model_name="tag",
            name="match",
            field=models.CharField(blank=True, max_length=256, verbose_name="match"),
        ),
        migrations.AlterField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="name",
            field=models.CharField(max_length=128, unique=True, verbose_name="name"),
        ),
    ]
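The `replaces` list above marks this as a squashed migration: on databases where 1006 through 1011 already ran, Django records the squash as applied without re-running anything; on fresh databases it runs alone in their place. A minimal skeleton of the pattern (app and migration names here are placeholders):

from django.db import migrations


class Migration(migrations.Migration):
    # This single migration stands in for the listed historical ones.
    replaces = [
        ("myapp", "0002_add_field"),
        ("myapp", "0003_rename_field"),
    ]

    dependencies = [("myapp", "0001_initial")]

    operations = [
        # the combined, de-duplicated operations of 0002 and 0003
    ]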
@@ -0,0 +1,90 @@
# Generated by Django 3.1.4 on 2020-12-12 14:41

import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("documents", "1006_auto_20201208_2209"),
    ]

    operations = [
        migrations.CreateModel(
            name="SavedView",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.CharField(max_length=128)),
                ("show_on_dashboard", models.BooleanField()),
                ("show_in_sidebar", models.BooleanField()),
                ("sort_field", models.CharField(max_length=128)),
                ("sort_reverse", models.BooleanField(default=False)),
                (
                    "user",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
        migrations.CreateModel(
            name="SavedViewFilterRule",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "rule_type",
                    models.PositiveIntegerField(
                        choices=[
                            (0, "Title contains"),
                            (1, "Content contains"),
                            (2, "ASN is"),
                            (3, "Correspondent is"),
                            (4, "Document type is"),
                            (5, "Is in inbox"),
                            (6, "Has tag"),
                            (7, "Has any tag"),
                            (8, "Created before"),
                            (9, "Created after"),
                            (10, "Created year is"),
                            (11, "Created month is"),
                            (12, "Created day is"),
                            (13, "Added before"),
                            (14, "Added after"),
                            (15, "Modified before"),
                            (16, "Modified after"),
                            (17, "Does not have tag"),
                        ],
                    ),
                ),
                ("value", models.CharField(max_length=128)),
                (
                    "saved_view",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="filter_rules",
                        to="documents.savedview",
                    ),
                ),
            ],
        ),
    ]
33 src/documents/migrations/1008_auto_20201216_1736.py Normal file
@@ -0,0 +1,33 @@
# Generated by Django 3.1.4 on 2020-12-16 17:36

import django.db.models.functions.text
from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1007_savedview_savedviewfilterrule"),
    ]

    operations = [
        migrations.AlterModelOptions(
            name="correspondent",
            options={"ordering": (django.db.models.functions.text.Lower("name"),)},
        ),
        migrations.AlterModelOptions(
            name="document",
            options={"ordering": ("-created",)},
        ),
        migrations.AlterModelOptions(
            name="documenttype",
            options={"ordering": (django.db.models.functions.text.Lower("name"),)},
        ),
        migrations.AlterModelOptions(
            name="savedview",
            options={"ordering": (django.db.models.functions.text.Lower("name"),)},
        ),
        migrations.AlterModelOptions(
            name="tag",
            options={"ordering": (django.db.models.functions.text.Lower("name"),)},
        ),
    ]
28 src/documents/migrations/1009_auto_20201216_2005.py Normal file
@@ -0,0 +1,28 @@
# Generated by Django 3.1.4 on 2020-12-16 20:05

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1008_auto_20201216_1736"),
    ]

    operations = [
        migrations.AlterModelOptions(
            name="correspondent",
            options={"ordering": ("name",)},
        ),
        migrations.AlterModelOptions(
            name="documenttype",
            options={"ordering": ("name",)},
        ),
        migrations.AlterModelOptions(
            name="savedview",
            options={"ordering": ("name",)},
        ),
        migrations.AlterModelOptions(
            name="tag",
            options={"ordering": ("name",)},
        ),
    ]
18 src/documents/migrations/1010_auto_20210101_2159.py Normal file
@@ -0,0 +1,18 @@
# Generated by Django 3.1.4 on 2021-01-01 21:59

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1009_auto_20201216_2005"),
    ]

    operations = [
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="value",
            field=models.CharField(blank=True, max_length=128, null=True),
        ),
    ]
454 src/documents/migrations/1011_auto_20210101_2340.py Normal file
@@ -0,0 +1,454 @@
# Generated by Django 3.1.4 on 2021-01-01 23:40

import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("documents", "1010_auto_20210101_2159"),
    ]

    operations = [
        migrations.AlterModelOptions(
            name="correspondent",
            options={
                "ordering": ("name",),
                "verbose_name": "correspondent",
                "verbose_name_plural": "correspondents",
            },
        ),
        migrations.AlterModelOptions(
            name="document",
            options={
                "ordering": ("-created",),
                "verbose_name": "document",
                "verbose_name_plural": "documents",
            },
        ),
        migrations.AlterModelOptions(
            name="documenttype",
            options={
                "verbose_name": "document type",
                "verbose_name_plural": "document types",
            },
        ),
        migrations.AlterModelOptions(
            name="log",
            options={
                "ordering": ("-created",),
                "verbose_name": "log",
                "verbose_name_plural": "logs",
            },
        ),
        migrations.AlterModelOptions(
            name="savedview",
            options={
                "ordering": ("name",),
                "verbose_name": "saved view",
                "verbose_name_plural": "saved views",
            },
        ),
        migrations.AlterModelOptions(
            name="savedviewfilterrule",
            options={
                "verbose_name": "filter rule",
                "verbose_name_plural": "filter rules",
            },
        ),
        migrations.AlterModelOptions(
            name="tag",
            options={"verbose_name": "tag", "verbose_name_plural": "tags"},
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="is_insensitive",
            field=models.BooleanField(default=True, verbose_name="is insensitive"),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="match",
            field=models.CharField(blank=True, max_length=256, verbose_name="match"),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="name",
            field=models.CharField(max_length=128, unique=True, verbose_name="name"),
        ),
        migrations.AlterField(
            model_name="document",
            name="added",
            field=models.DateTimeField(
                db_index=True,
                default=django.utils.timezone.now,
                editable=False,
                verbose_name="added",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="archive_checksum",
            field=models.CharField(
                blank=True,
                editable=False,
                help_text="The checksum of the archived document.",
                max_length=32,
                null=True,
                verbose_name="archive checksum",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="archive_serial_number",
            field=models.IntegerField(
                blank=True,
                db_index=True,
                help_text="The position of this document in your physical document archive.",
                null=True,
                unique=True,
                verbose_name="archive serial number",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                editable=False,
                help_text="The checksum of the original document.",
                max_length=32,
                unique=True,
                verbose_name="checksum",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="content",
            field=models.TextField(
                blank=True,
                help_text="The raw, text-only data of the document. This field is primarily used for searching.",
                verbose_name="content",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="correspondent",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.correspondent",
                verbose_name="correspondent",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="created",
            field=models.DateTimeField(
                db_index=True,
                default=django.utils.timezone.now,
                verbose_name="created",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="document_type",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.documenttype",
                verbose_name="document type",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="filename",
            field=models.FilePathField(
                default=None,
                editable=False,
                help_text="Current filename in storage",
                max_length=1024,
                null=True,
                verbose_name="filename",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="mime_type",
            field=models.CharField(
                editable=False,
                max_length=256,
                verbose_name="mime type",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="modified",
            field=models.DateTimeField(
                auto_now=True,
                db_index=True,
                verbose_name="modified",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="storage_type",
            field=models.CharField(
                choices=[
                    ("unencrypted", "Unencrypted"),
                    ("gpg", "Encrypted with GNU Privacy Guard"),
                ],
                default="unencrypted",
                editable=False,
                max_length=11,
                verbose_name="storage type",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="tags",
            field=models.ManyToManyField(
                blank=True,
                related_name="documents",
                to="documents.Tag",
                verbose_name="tags",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="title",
            field=models.CharField(
                blank=True,
                db_index=True,
                max_length=128,
                verbose_name="title",
            ),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="is_insensitive",
            field=models.BooleanField(default=True, verbose_name="is insensitive"),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="match",
            field=models.CharField(blank=True, max_length=256, verbose_name="match"),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="name",
            field=models.CharField(max_length=128, unique=True, verbose_name="name"),
        ),
        migrations.AlterField(
            model_name="log",
            name="created",
            field=models.DateTimeField(auto_now_add=True, verbose_name="created"),
        ),
        migrations.AlterField(
            model_name="log",
            name="group",
            field=models.UUIDField(blank=True, null=True, verbose_name="group"),
        ),
        migrations.AlterField(
            model_name="log",
            name="level",
            field=models.PositiveIntegerField(
                choices=[
                    (10, "debug"),
                    (20, "information"),
                    (30, "warning"),
                    (40, "error"),
                    (50, "critical"),
                ],
                default=20,
                verbose_name="level",
            ),
        ),
        migrations.AlterField(
            model_name="log",
            name="message",
            field=models.TextField(verbose_name="message"),
        ),
        migrations.AlterField(
            model_name="savedview",
            name="name",
            field=models.CharField(max_length=128, verbose_name="name"),
        ),
        migrations.AlterField(
            model_name="savedview",
            name="show_in_sidebar",
            field=models.BooleanField(verbose_name="show in sidebar"),
        ),
        migrations.AlterField(
            model_name="savedview",
            name="show_on_dashboard",
            field=models.BooleanField(verbose_name="show on dashboard"),
        ),
        migrations.AlterField(
            model_name="savedview",
            name="sort_field",
            field=models.CharField(max_length=128, verbose_name="sort field"),
        ),
        migrations.AlterField(
            model_name="savedview",
            name="sort_reverse",
            field=models.BooleanField(default=False, verbose_name="sort reverse"),
        ),
        migrations.AlterField(
            model_name="savedview",
            name="user",
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.CASCADE,
                to=settings.AUTH_USER_MODEL,
                verbose_name="user",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                ],
                verbose_name="rule type",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="saved_view",
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.CASCADE,
                related_name="filter_rules",
                to="documents.savedview",
                verbose_name="saved view",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="value",
            field=models.CharField(
                blank=True,
                max_length=128,
                null=True,
                verbose_name="value",
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="colour",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "#a6cee3"),
                    (2, "#1f78b4"),
                    (3, "#b2df8a"),
                    (4, "#33a02c"),
                    (5, "#fb9a99"),
                    (6, "#e31a1c"),
                    (7, "#fdbf6f"),
                    (8, "#ff7f00"),
                    (9, "#cab2d6"),
                    (10, "#6a3d9a"),
                    (11, "#b15928"),
                    (12, "#000000"),
                    (13, "#cccccc"),
                ],
                default=1,
                verbose_name="color",
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="is_inbox_tag",
            field=models.BooleanField(
                default=False,
                help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.",
                verbose_name="is inbox tag",
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="is_insensitive",
            field=models.BooleanField(default=True, verbose_name="is insensitive"),
        ),
        migrations.AlterField(
            model_name="tag",
            name="match",
            field=models.CharField(blank=True, max_length=256, verbose_name="match"),
        ),
        migrations.AlterField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="name",
            field=models.CharField(max_length=128, unique=True, verbose_name="name"),
        ),
    ]
367 src/documents/migrations/1012_fix_archive_files.py Normal file
@@ -0,0 +1,367 @@
# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
from collections import defaultdict
from pathlib import Path
from time import sleep

import pathvalidate
from django.conf import settings
from django.db import migrations
from django.db import models
from django.template.defaultfilters import slugify

logger = logging.getLogger("paperless.migrations")


###############################################################################
# This is code copied straight from paperless before the change.
###############################################################################
class defaultdictNoStr(defaultdict):
    def __str__(self):  # pragma: no cover
        raise ValueError("Don't use {tags} directly.")


def many_to_dictionary(field):  # pragma: no cover
    # Converts a ManyToManyField to a dictionary by assuming that field
    # entries contain an _ or - which will be used as a delimiter
    mydictionary = dict()

    for index, t in enumerate(field.all()):
        # Populate tag names by index
        mydictionary[index] = slugify(t.name)

        # Find delimiter
        delimiter = t.name.find("_")

        if delimiter == -1:
            delimiter = t.name.find("-")

        if delimiter == -1:
            continue

        key = t.name[:delimiter]
        value = t.name[delimiter + 1 :]

        mydictionary[slugify(key)] = slugify(value)

    return mydictionary


def archive_name_from_filename(filename: Path) -> Path:
    return Path(filename.stem + ".pdf")


def archive_path_old(doc) -> Path:
    if doc.filename:
        fname = archive_name_from_filename(Path(doc.filename))
    else:
        fname = Path(f"{doc.pk:07}.pdf")

    return settings.ARCHIVE_DIR / fname


STORAGE_TYPE_GPG = "gpg"


def archive_path_new(doc) -> Path | None:
    if doc.archive_filename is not None:
        return settings.ARCHIVE_DIR / doc.archive_filename
    else:
        return None


def source_path(doc) -> Path:
    if doc.filename:
        fname = doc.filename
    else:
        fname = f"{doc.pk:07}{doc.file_type}"
        if doc.storage_type == STORAGE_TYPE_GPG:
            fname = Path(str(fname) + ".gpg")  # pragma: no cover

    return settings.ORIGINALS_DIR / fname


def generate_unique_filename(doc, *, archive_filename=False):
    if archive_filename:
        old_filename = doc.archive_filename
        root = settings.ARCHIVE_DIR
    else:
        old_filename = doc.filename
        root = settings.ORIGINALS_DIR

    counter = 0

    while True:
        new_filename = generate_filename(
            doc,
            counter=counter,
            archive_filename=archive_filename,
        )
        if new_filename == old_filename:
            # still the same as before.
            return new_filename

        if (root / new_filename).exists():
            counter += 1
        else:
            return new_filename


def generate_filename(doc, *, counter=0, append_gpg=True, archive_filename=False):
    path = ""

    try:
        if settings.FILENAME_FORMAT is not None:
            tags = defaultdictNoStr(lambda: slugify(None), many_to_dictionary(doc.tags))

            tag_list = pathvalidate.sanitize_filename(
                ",".join(sorted([tag.name for tag in doc.tags.all()])),
                replacement_text="-",
            )

            if doc.correspondent:
                correspondent = pathvalidate.sanitize_filename(
                    doc.correspondent.name,
                    replacement_text="-",
                )
            else:
                correspondent = "none"

            if doc.document_type:
                document_type = pathvalidate.sanitize_filename(
                    doc.document_type.name,
                    replacement_text="-",
                )
            else:
                document_type = "none"

            path = settings.FILENAME_FORMAT.format(
                title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
                correspondent=correspondent,
                document_type=document_type,
                created=datetime.date.isoformat(doc.created),
                created_year=doc.created.year if doc.created else "none",
                created_month=f"{doc.created.month:02}" if doc.created else "none",
                created_day=f"{doc.created.day:02}" if doc.created else "none",
                added=datetime.date.isoformat(doc.added),
                added_year=doc.added.year if doc.added else "none",
                added_month=f"{doc.added.month:02}" if doc.added else "none",
                added_day=f"{doc.added.day:02}" if doc.added else "none",
                tags=tags,
                tag_list=tag_list,
            ).strip()

            path = path.strip(os.sep)

    except (ValueError, KeyError, IndexError):
        logger.warning(
            f"Invalid PAPERLESS_FILENAME_FORMAT: "
            f"{settings.FILENAME_FORMAT}, falling back to default",
        )

    counter_str = f"_{counter:02}" if counter else ""

    filetype_str = ".pdf" if archive_filename else doc.file_type

    if len(path) > 0:
        filename = f"{path}{counter_str}{filetype_str}"
    else:
        filename = f"{doc.pk:07}{counter_str}{filetype_str}"

    # Append .gpg for encrypted files
    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename


###############################################################################
# This code performs bidirectional archive file transformation.
###############################################################################


def parse_wrapper(parser, path, mime_type, file_name):
    # this is here so that I can mock this out for testing.
    parser.parse(path, mime_type, file_name)


def create_archive_version(doc, retry_count=3):
    from documents.parsers import DocumentParser
    from documents.parsers import ParseError
    from documents.parsers import get_parser_class_for_mime_type

    logger.info(f"Regenerating archive document for document ID:{doc.id}")
    parser_class = get_parser_class_for_mime_type(doc.mime_type)
    for try_num in range(retry_count):
        parser: DocumentParser = parser_class(None, None)
        try:
            parse_wrapper(
                parser,
                source_path(doc),
                doc.mime_type,
                Path(doc.filename).name,
            )
            doc.content = parser.get_text()

            if parser.get_archive_path() and Path(parser.get_archive_path()).is_file():
                doc.archive_filename = generate_unique_filename(
                    doc,
                    archive_filename=True,
                )
                with Path(parser.get_archive_path()).open("rb") as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                archive_path_new(doc).parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
            else:
                doc.archive_checksum = None
                logger.error(
                    f"Parser did not return an archive document for document "
                    f"ID:{doc.id}. Removing archive document.",
                )
            doc.save()
            return
        except ParseError:
            if try_num + 1 == retry_count:
                logger.exception(
                    f"Unable to regenerate archive document for ID:{doc.id}. You "
                    f"need to invoke the document_archiver management command "
                    f"manually for that document.",
                )
                doc.archive_checksum = None
                doc.save()
                return
            else:
                # This is mostly here for the tika parser in docker
                # environments. The servers for parsing need to come up first,
                # and the docker setup doesn't ensure that tika is running
                # before attempting migrations.
                logger.error("Parse error, will try again in 5 seconds...")
                sleep(5)
        finally:
            parser.cleanup()


def move_old_to_new_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()

    old_archive_path_to_id = {}

    # check for documents that have incorrect archive versions
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)

        if old_path in old_archive_path_to_id:
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    # check that archive files of all unaffected documents are in place
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        if doc.id not in affected_document_ids and not old_path.is_file():
            raise ValueError(
                f"Archived document ID:{doc.id} does not exist at: {old_path}",
            )

    # check that we can regenerate affected archive versions
    for doc_id in affected_document_ids:
        from documents.parsers import get_parser_class_for_mime_type

        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            raise ValueError(
                f"Document ID:{doc.id} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.",
            )

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        if doc.id in affected_document_ids:
            old_path = archive_path_old(doc)
            # remove affected archive versions
            if old_path.is_file():
                logger.debug(f"Removing {old_path}")
                old_path.unlink()
        else:
            # Set archive path for unaffected files
            doc.archive_filename = archive_name_from_filename(Path(doc.filename))
            Document.objects.filter(id=doc.id).update(
                archive_filename=doc.archive_filename,
            )

    # regenerate archive documents
    for doc_id in affected_document_ids:
        doc = Document.objects.get(id=doc_id)
        create_archive_version(doc)


def move_new_to_old_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    old_archive_paths = set()

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if old_archive_path in old_archive_paths:
            raise ValueError(
                f"Cannot migrate: Archive file name {old_archive_path} of "
                f"document {doc.filename} would clash with another archive "
                f"filename.",
            )
        old_archive_paths.add(old_archive_path)
        if new_archive_path != old_archive_path and old_archive_path.is_file():
            raise ValueError(
                f"Cannot migrate: Cannot move {new_archive_path} to "
                f"{old_archive_path}: file already exists.",
            )

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if new_archive_path != old_archive_path:
            logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
            shutil.move(new_archive_path, old_archive_path)


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1011_auto_20210101_2340"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="archive_filename",
            field=models.FilePathField(
                default=None,
                editable=False,
                help_text="Current archive filename in storage",
                max_length=1024,
                null=True,
                unique=True,
                verbose_name="archive filename",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="filename",
            field=models.FilePathField(
                default=None,
                editable=False,
                help_text="Current filename in storage",
                max_length=1024,
                null=True,
                unique=True,
                verbose_name="filename",
            ),
        ),
        migrations.RunPython(move_old_to_new_locations, move_new_to_old_locations),
    ]
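Both migration directions key off `archive_checksum`, which `create_archive_version` computes by hashing the whole archive file with MD5 to fill the 32-character field from 1005_checksums. A standalone sketch of the same fingerprint, reading in chunks so large PDFs need not fit in memory (the chunk size is an arbitrary choice):

import hashlib
from pathlib import Path


def archive_checksum(path: Path) -> str:
    # MD5 serves as a content fingerprint for change detection here,
    # not as a security boundary.
    digest = hashlib.md5()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(64 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()  # 32 hex characters, as the model expects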
74 src/documents/migrations/1013_migrate_tag_colour.py Normal file
@@ -0,0 +1,74 @@
|
||||
# Generated by Django 3.1.4 on 2020-12-02 21:43
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
COLOURS_OLD = {
|
||||
1: "#a6cee3",
|
||||
2: "#1f78b4",
|
||||
3: "#b2df8a",
|
||||
4: "#33a02c",
|
||||
5: "#fb9a99",
|
||||
6: "#e31a1c",
|
||||
7: "#fdbf6f",
|
||||
8: "#ff7f00",
|
||||
9: "#cab2d6",
|
||||
10: "#6a3d9a",
|
||||
11: "#b15928",
|
||||
12: "#000000",
|
||||
13: "#cccccc",
|
||||
}
|
||||
|
||||
|
||||
def forward(apps, schema_editor):
|
||||
Tag = apps.get_model("documents", "Tag")
|
||||
|
||||
for tag in Tag.objects.all():
|
||||
colour_old_id = tag.colour_old
|
||||
rgb = COLOURS_OLD[colour_old_id]
|
||||
tag.color = rgb
|
||||
tag.save()
|
||||
|
||||
|
||||
def reverse(apps, schema_editor):
|
||||
Tag = apps.get_model("documents", "Tag")
|
||||
|
||||
def _get_colour_id(rdb):
|
||||
for idx, rdbx in COLOURS_OLD.items():
|
||||
if rdbx == rdb:
|
||||
return idx
|
||||
# Return colour 1 if we can't match anything
|
||||
return 1
|
||||
|
||||
for tag in Tag.objects.all():
|
||||
colour_id = _get_colour_id(tag.color)
|
||||
tag.colour_old = colour_id
|
||||
tag.save()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("documents", "1012_fix_archive_files"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name="tag",
|
||||
old_name="colour",
|
||||
new_name="colour_old",
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="tag",
|
||||
name="color",
|
||||
field=models.CharField(
|
||||
default="#a6cee3",
|
||||
max_length=7,
|
||||
verbose_name="color",
|
||||
),
|
||||
),
|
||||
migrations.RunPython(forward, reverse),
|
||||
migrations.RemoveField(
|
||||
model_name="tag",
|
||||
name="colour_old",
|
||||
),
|
||||
]
|
||||
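The migration above follows the usual rename-add-copy-remove pattern for changing a field's representation without losing data: the old integer column is renamed out of the way, the new hex column is added, RunPython copies values across, and only then is the old column dropped. Note the reverse mapping is lossy: any hex value not in COLOURS_OLD collapses to colour 1. A standalone sketch of that round-trip (mapping abbreviated here purely for illustration):

# Round-trip sketch of the colour mapping used above.
COLOURS_OLD = {1: "#a6cee3", 2: "#1f78b4"}  # abbreviated

def to_hex(colour_id: int) -> str:
    return COLOURS_OLD[colour_id]

def to_id(rgb: str) -> int:
    # Unmatched colours fall back to the default id, exactly as in reverse().
    return next((i for i, c in COLOURS_OLD.items() if c == rgb), 1)

assert to_id(to_hex(2)) == 2
assert to_id("#123456") == 1  # custom colours are collapsed on rollback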
42
src/documents/migrations/1014_auto_20210228_1614.py
Normal file
@@ -0,0 +1,42 @@
# Generated by Django 3.1.7 on 2021-02-28 15:14

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1013_migrate_tag_colour"),
    ]

    operations = [
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                ],
                verbose_name="rule type",
            ),
        ),
    ]
27
src/documents/migrations/1015_remove_null_characters.py
Normal file
@@ -0,0 +1,27 @@
# Generated by Django 3.1.7 on 2021-04-04 18:28
import logging

from django.db import migrations

logger = logging.getLogger("paperless.migrations")


def remove_null_characters(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    for doc in Document.objects.all():
        content: str = doc.content
        if "\0" in content:
            logger.info(f"Removing null characters from document {doc}...")
            doc.content = content.replace("\0", " ")
            doc.save()


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1014_auto_20210228_1614"),
    ]

    operations = [
        migrations.RunPython(remove_null_characters, migrations.RunPython.noop),
    ]
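PostgreSQL rejects NUL (0x00) bytes in text columns, which is why stray null characters in document content have to be scrubbed. The loop above caches every Document in memory at once; on a large install, a streaming variant keeps memory flat. A sketch under the same historical model (only() and iterator() are standard queryset methods):

def remove_null_characters_streaming(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    # iterator() streams rows instead of caching the whole queryset;
    # only() avoids loading unrelated columns.
    for doc in Document.objects.only("id", "content").iterator():
        if doc.content and "\0" in doc.content:
            doc.content = doc.content.replace("\0", " ")
            doc.save(update_fields=["content"])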
54
src/documents/migrations/1016_auto_20210317_1351.py
Normal file
@@ -0,0 +1,54 @@
# Generated by Django 3.1.7 on 2021-03-17 12:51

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1015_remove_null_characters"),
    ]

    operations = [
        migrations.AlterField(
            model_name="savedview",
            name="sort_field",
            field=models.CharField(
                blank=True,
                max_length=128,
                null=True,
                verbose_name="sort field",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                ],
                verbose_name="rule type",
            ),
        ),
    ]
@@ -0,0 +1,190 @@
# Generated by Django 4.2.13 on 2024-06-28 18:09

import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    replaces = [
        ("documents", "1016_auto_20210317_1351"),
        ("documents", "1017_alter_savedviewfilterrule_rule_type"),
        ("documents", "1018_alter_savedviewfilterrule_value"),
        ("documents", "1019_uisettings"),
        ("documents", "1019_storagepath_document_storage_path"),
        ("documents", "1020_merge_20220518_1839"),
    ]

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("documents", "1015_remove_null_characters"),
    ]

    operations = [
        migrations.AlterField(
            model_name="savedview",
            name="sort_field",
            field=models.CharField(
                blank=True,
                max_length=128,
                null=True,
                verbose_name="sort field",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                ],
                verbose_name="rule type",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                    (22, "has tags in"),
                ],
                verbose_name="rule type",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="value",
            field=models.CharField(
                blank=True,
                max_length=255,
                null=True,
                verbose_name="value",
            ),
        ),
        migrations.CreateModel(
            name="UiSettings",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("settings", models.JSONField(null=True)),
                (
                    "user",
                    models.OneToOneField(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="ui_settings",
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
        migrations.CreateModel(
            name="StoragePath",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "name",
                    models.CharField(max_length=128, unique=True, verbose_name="name"),
                ),
                (
                    "match",
                    models.CharField(blank=True, max_length=256, verbose_name="match"),
                ),
                (
                    "matching_algorithm",
                    models.PositiveIntegerField(
                        choices=[
                            (1, "Any word"),
                            (2, "All words"),
                            (3, "Exact match"),
                            (4, "Regular expression"),
                            (5, "Fuzzy word"),
                            (6, "Automatic"),
                        ],
                        default=1,
                        verbose_name="matching algorithm",
                    ),
                ),
                (
                    "is_insensitive",
                    models.BooleanField(default=True, verbose_name="is insensitive"),
                ),
                ("path", models.CharField(max_length=512, verbose_name="path")),
            ],
            options={
                "verbose_name": "storage path",
                "verbose_name_plural": "storage paths",
                "ordering": ("name",),
            },
        ),
        migrations.AddField(
            model_name="document",
            name="storage_path",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.storagepath",
                verbose_name="storage path",
            ),
        ),
    ]
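The replaces list above marks this as a squashed migration: databases that have already applied any of the listed migrations simply record the squash as applied, while fresh databases run only the squash. Squashes like this are typically produced with Django's squashmigrations command; a sketch of generating one (the prefix names are illustrative, matched against the files above):

from django.core.management import call_command

# Squash the documents migrations from 1016 through 1020 into one file;
# Django fills in the replaces list automatically.
call_command("squashmigrations", "documents", "1016", "1020")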
@@ -0,0 +1,45 @@
# Generated by Django 3.2.12 on 2022-03-17 11:59

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1016_auto_20210317_1351"),
    ]

    operations = [
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                    (22, "has tags in"),
                ],
                verbose_name="rule type",
            ),
        ),
    ]
@@ -0,0 +1,23 @@
# Generated by Django 4.0.3 on 2022-04-01 22:50

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1017_alter_savedviewfilterrule_rule_type"),
    ]

    operations = [
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="value",
            field=models.CharField(
                blank=True,
                max_length=255,
                null=True,
                verbose_name="value",
            ),
        ),
    ]
@@ -0,0 +1,73 @@
# Generated by Django 4.0.4 on 2022-05-02 15:56

import django.db.models.deletion
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1018_alter_savedviewfilterrule_value"),
    ]

    operations = [
        migrations.CreateModel(
            name="StoragePath",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "name",
                    models.CharField(max_length=128, unique=True, verbose_name="name"),
                ),
                (
                    "match",
                    models.CharField(blank=True, max_length=256, verbose_name="match"),
                ),
                (
                    "matching_algorithm",
                    models.PositiveIntegerField(
                        choices=[
                            (1, "Any word"),
                            (2, "All words"),
                            (3, "Exact match"),
                            (4, "Regular expression"),
                            (5, "Fuzzy word"),
                            (6, "Automatic"),
                        ],
                        default=1,
                        verbose_name="matching algorithm",
                    ),
                ),
                (
                    "is_insensitive",
                    models.BooleanField(default=True, verbose_name="is insensitive"),
                ),
                ("path", models.CharField(max_length=512, verbose_name="path")),
            ],
            options={
                "verbose_name": "storage path",
                "verbose_name_plural": "storage paths",
                "ordering": ("name",),
            },
        ),
        migrations.AddField(
            model_name="document",
            name="storage_path",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="documents",
                to="documents.storagepath",
                verbose_name="storage path",
            ),
        ),
    ]
39
src/documents/migrations/1019_uisettings.py
Normal file
@@ -0,0 +1,39 @@
# Generated by Django 4.0.4 on 2022-05-07 05:10

import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("documents", "1018_alter_savedviewfilterrule_value"),
    ]

    operations = [
        migrations.CreateModel(
            name="UiSettings",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("settings", models.JSONField(null=True)),
                (
                    "user",
                    models.OneToOneField(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="ui_settings",
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
    ]
12
src/documents/migrations/1020_merge_20220518_1839.py
Normal file
@@ -0,0 +1,12 @@
# Generated by Django 4.0.4 on 2022-05-18 18:39

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1019_storagepath_document_storage_path"),
        ("documents", "1019_uisettings"),
    ]

    operations = []
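A merge migration like the one above carries no operations of its own; it only ties together the two divergent 1019 branches so the migration graph has a single leaf again. Django offers to create these automatically when it detects conflicting leaves; a sketch:

from django.core.management import call_command

# Detect conflicting leaf migrations and write a no-op merge migration.
call_command("makemigrations", "--merge")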
104
src/documents/migrations/1021_webp_thumbnail_conversion.py
Normal file
@@ -0,0 +1,104 @@
# Generated by Django 4.0.5 on 2022-06-11 15:40
import logging
import multiprocessing.pool
import shutil
import tempfile
import time
from pathlib import Path

from django.conf import settings
from django.db import migrations

from documents.parsers import run_convert

logger = logging.getLogger("paperless.migrations")


def _do_convert(work_package):
    existing_thumbnail, converted_thumbnail = work_package
    try:
        logger.info(f"Converting thumbnail: {existing_thumbnail}")

        # Run actual conversion
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            input_file=f"{existing_thumbnail}[0]",
            output_file=str(converted_thumbnail),
        )

        # Copy newly created thumbnail to thumbnail directory
        shutil.copy(converted_thumbnail, existing_thumbnail.parent)

        # Remove the PNG version
        existing_thumbnail.unlink()

        logger.info(
            "Conversion to WebP completed, "
            f"replaced {existing_thumbnail.name} with {converted_thumbnail.name}",
        )

    except Exception as e:
        logger.error(f"Error converting thumbnail (existing file unchanged): {e}")


def _convert_thumbnails_to_webp(apps, schema_editor):
    start = time.time()

    with tempfile.TemporaryDirectory() as tempdir:
        work_packages = []

        for file in Path(settings.THUMBNAIL_DIR).glob("*.png"):
            existing_thumbnail = file.resolve()

            # Change the existing filename suffix from png to webp
            converted_thumbnail_name = existing_thumbnail.with_suffix(
                ".webp",
            ).name

            # Create the expected output filename in the tempdir
            converted_thumbnail = (
                Path(tempdir) / Path(converted_thumbnail_name)
            ).resolve()

            # Package up the necessary info
            work_packages.append(
                (existing_thumbnail, converted_thumbnail),
            )

        if work_packages:
            logger.info(
                "\n\n"
                "  This is a one-time only migration to convert thumbnails for all of your\n"
                "  documents into WebP format. If you have a lot of documents though, \n"
                "  this may take a while, so a coffee break may be in order."
                "\n",
            )

            with multiprocessing.pool.Pool(
                processes=min(multiprocessing.cpu_count(), 4),
                maxtasksperchild=4,
            ) as pool:
                pool.map(_do_convert, work_packages)

        end = time.time()
        duration = end - start

        logger.info(f"Conversion completed in {duration:.3f}s")


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1020_merge_20220518_1839"),
    ]

    operations = [
        migrations.RunPython(
            code=_convert_thumbnails_to_webp,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
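run_convert is imported from documents.parsers, whose implementation is not shown in this diff; judging by its keyword arguments, it wraps ImageMagick's convert CLI. A purely hypothetical sketch of such a wrapper follows (the real function may differ; the flags shown are standard ImageMagick options):

import subprocess

def run_convert_sketch(*, density, scale, alpha, strip, trim, auto_orient,
                       input_file, output_file):
    # Hypothetical stand-in for documents.parsers.run_convert.
    args = ["convert", "-density", str(density)]
    if auto_orient:
        args.append("-auto-orient")
    args += ["-scale", scale, "-alpha", alpha]
    if strip:
        args.append("-strip")
    if trim:
        args.append("-trim")
    args += [input_file, output_file]
    subprocess.run(args, check=True)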
52
src/documents/migrations/1022_paperlesstask.py
Normal file
@@ -0,0 +1,52 @@
# Generated by Django 4.0.4 on 2022-05-23 07:14

import django.db.models.deletion
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1021_webp_thumbnail_conversion"),
    ]

    operations = [
        migrations.CreateModel(
            name="PaperlessTask",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("task_id", models.CharField(max_length=128)),
                ("name", models.CharField(max_length=256, null=True)),
                (
                    "created",
                    models.DateTimeField(auto_now=True, verbose_name="created"),
                ),
                (
                    "started",
                    models.DateTimeField(null=True, verbose_name="started"),
                ),
                ("acknowledged", models.BooleanField(default=False)),
                (
                    "attempted_task",
                    models.OneToOneField(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="attempted_task",
                        # This is a dummy field; 1026 will fix up the column.
                        # This manual change is required, as django doesn't really
                        # support removing an app which has migration deps like this
                        to="documents.document",
                    ),
                ),
            ],
        ),
    ]
@@ -0,0 +1,668 @@
# Generated by Django 4.2.13 on 2024-06-28 18:10

import django.core.validators
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    replaces = [
        ("documents", "1022_paperlesstask"),
        ("documents", "1023_add_comments"),
        ("documents", "1024_document_original_filename"),
        ("documents", "1025_alter_savedviewfilterrule_rule_type"),
        ("documents", "1026_transition_to_celery"),
        ("documents", "1027_remove_paperlesstask_attempted_task_and_more"),
        ("documents", "1028_remove_paperlesstask_task_args_and_more"),
        ("documents", "1029_alter_document_archive_serial_number"),
        ("documents", "1030_alter_paperlesstask_task_file_name"),
        ("documents", "1031_remove_savedview_user_correspondent_owner_and_more"),
        ("documents", "1032_alter_correspondent_matching_algorithm_and_more"),
        ("documents", "1033_alter_documenttype_options_alter_tag_options_and_more"),
        ("documents", "1034_alter_savedviewfilterrule_rule_type"),
        ("documents", "1035_rename_comment_note"),
        ("documents", "1036_alter_savedviewfilterrule_rule_type"),
    ]

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("django_celery_results", "0011_taskresult_periodic_task_name"),
        ("documents", "1021_webp_thumbnail_conversion"),
    ]

    operations = [
        migrations.CreateModel(
            name="Comment",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "comment",
                    models.TextField(
                        blank=True,
                        help_text="Comment for the document",
                        verbose_name="content",
                    ),
                ),
                (
                    "created",
                    models.DateTimeField(
                        db_index=True,
                        default=django.utils.timezone.now,
                        verbose_name="created",
                    ),
                ),
                (
                    "document",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="documents",
                        to="documents.document",
                        verbose_name="document",
                    ),
                ),
                (
                    "user",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="users",
                        to=settings.AUTH_USER_MODEL,
                        verbose_name="user",
                    ),
                ),
            ],
            options={
                "verbose_name": "comment",
                "verbose_name_plural": "comments",
                "ordering": ("created",),
            },
        ),
        migrations.AddField(
            model_name="document",
            name="original_filename",
            field=models.CharField(
                default=None,
                editable=False,
                help_text="The original name of the file when it was uploaded",
                max_length=1024,
                null=True,
                verbose_name="original filename",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                    (22, "has tags in"),
                    (23, "ASN greater than"),
                    (24, "ASN less than"),
                    (25, "storage path is"),
                ],
                verbose_name="rule type",
            ),
        ),
        migrations.CreateModel(
            name="PaperlessTask",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("task_id", models.CharField(max_length=128)),
                ("acknowledged", models.BooleanField(default=False)),
                (
                    "attempted_task",
                    models.OneToOneField(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="attempted_task",
                        to="django_celery_results.taskresult",
                    ),
                ),
            ],
        ),
        migrations.RunSQL(
            sql="DROP TABLE IF EXISTS django_q_ormq",
            reverse_sql="",
        ),
        migrations.RunSQL(
            sql="DROP TABLE IF EXISTS django_q_schedule",
            reverse_sql="",
        ),
        migrations.RunSQL(
            sql="DROP TABLE IF EXISTS django_q_task",
            reverse_sql="",
        ),
        migrations.RemoveField(
            model_name="paperlesstask",
            name="attempted_task",
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="date_created",
            field=models.DateTimeField(
                default=django.utils.timezone.now,
                help_text="Datetime field when the task result was created in UTC",
                null=True,
                verbose_name="Created DateTime",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="date_done",
            field=models.DateTimeField(
                default=None,
                help_text="Datetime field when the task was completed in UTC",
                null=True,
                verbose_name="Completed DateTime",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="date_started",
            field=models.DateTimeField(
                default=None,
                help_text="Datetime field when the task was started in UTC",
                null=True,
                verbose_name="Started DateTime",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="result",
            field=models.TextField(
                default=None,
                help_text="The data returned by the task",
                null=True,
                verbose_name="Result Data",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="status",
            field=models.CharField(
                choices=[
                    ("FAILURE", "FAILURE"),
                    ("PENDING", "PENDING"),
                    ("RECEIVED", "RECEIVED"),
                    ("RETRY", "RETRY"),
                    ("REVOKED", "REVOKED"),
                    ("STARTED", "STARTED"),
                    ("SUCCESS", "SUCCESS"),
                ],
                default="PENDING",
                help_text="Current state of the task being run",
                max_length=30,
                verbose_name="Task State",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="task_name",
            field=models.CharField(
                help_text="Name of the Task which was run",
                max_length=255,
                null=True,
                verbose_name="Task Name",
            ),
        ),
        migrations.AlterField(
            model_name="paperlesstask",
            name="acknowledged",
            field=models.BooleanField(
                default=False,
                help_text="If the task is acknowledged via the frontend or API",
                verbose_name="Acknowledged",
            ),
        ),
        migrations.AlterField(
            model_name="paperlesstask",
            name="task_id",
            field=models.CharField(
                help_text="Celery ID for the Task that was run",
                max_length=255,
                unique=True,
                verbose_name="Task ID",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="archive_serial_number",
            field=models.PositiveIntegerField(
                blank=True,
                db_index=True,
                help_text="The position of this document in your physical document archive.",
                null=True,
                unique=True,
                validators=[
                    django.core.validators.MaxValueValidator(4294967295),
                    django.core.validators.MinValueValidator(0),
                ],
                verbose_name="archive serial number",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="task_file_name",
            field=models.CharField(
                help_text="Name of the file which the Task was run for",
                max_length=255,
                null=True,
                verbose_name="Task Filename",
            ),
        ),
        migrations.RenameField(
            model_name="savedview",
            old_name="user",
            new_name="owner",
        ),
        migrations.AlterField(
            model_name="savedview",
            name="owner",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to=settings.AUTH_USER_MODEL,
                verbose_name="owner",
            ),
        ),
        migrations.AddField(
            model_name="correspondent",
            name="owner",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to=settings.AUTH_USER_MODEL,
                verbose_name="owner",
            ),
        ),
        migrations.AddField(
            model_name="document",
            name="owner",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to=settings.AUTH_USER_MODEL,
                verbose_name="owner",
            ),
        ),
        migrations.AddField(
            model_name="documenttype",
            name="owner",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to=settings.AUTH_USER_MODEL,
                verbose_name="owner",
            ),
        ),
        migrations.AddField(
            model_name="storagepath",
            name="owner",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to=settings.AUTH_USER_MODEL,
                verbose_name="owner",
            ),
        ),
        migrations.AddField(
            model_name="tag",
            name="owner",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to=settings.AUTH_USER_MODEL,
                verbose_name="owner",
            ),
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "None"),
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "None"),
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="storagepath",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "None"),
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterField(
            model_name="tag",
            name="matching_algorithm",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "None"),
                    (1, "Any word"),
                    (2, "All words"),
                    (3, "Exact match"),
                    (4, "Regular expression"),
                    (5, "Fuzzy word"),
                    (6, "Automatic"),
                ],
                default=1,
                verbose_name="matching algorithm",
            ),
        ),
        migrations.AlterModelOptions(
            name="documenttype",
            options={
                "ordering": ("name",),
                "verbose_name": "document type",
                "verbose_name_plural": "document types",
            },
        ),
        migrations.AlterModelOptions(
            name="tag",
            options={
                "ordering": ("name",),
                "verbose_name": "tag",
                "verbose_name_plural": "tags",
            },
        ),
        migrations.AlterField(
            model_name="correspondent",
            name="name",
            field=models.CharField(max_length=128, verbose_name="name"),
        ),
        migrations.AlterField(
            model_name="documenttype",
            name="name",
            field=models.CharField(max_length=128, verbose_name="name"),
        ),
        migrations.AlterField(
            model_name="storagepath",
            name="name",
            field=models.CharField(max_length=128, verbose_name="name"),
        ),
        migrations.AlterField(
            model_name="tag",
            name="name",
            field=models.CharField(max_length=128, verbose_name="name"),
        ),
        migrations.AddConstraint(
            model_name="correspondent",
            constraint=models.UniqueConstraint(
                fields=("name", "owner"),
                name="documents_correspondent_unique_name_owner",
            ),
        ),
        migrations.AddConstraint(
            model_name="correspondent",
            constraint=models.UniqueConstraint(
                condition=models.Q(("owner__isnull", True)),
                fields=("name",),
                name="documents_correspondent_name_uniq",
            ),
        ),
        migrations.AddConstraint(
            model_name="documenttype",
            constraint=models.UniqueConstraint(
                fields=("name", "owner"),
                name="documents_documenttype_unique_name_owner",
            ),
        ),
        migrations.AddConstraint(
            model_name="documenttype",
            constraint=models.UniqueConstraint(
                condition=models.Q(("owner__isnull", True)),
                fields=("name",),
                name="documents_documenttype_name_uniq",
            ),
        ),
        migrations.AddConstraint(
            model_name="storagepath",
            constraint=models.UniqueConstraint(
                fields=("name", "owner"),
                name="documents_storagepath_unique_name_owner",
            ),
        ),
        migrations.AddConstraint(
            model_name="storagepath",
            constraint=models.UniqueConstraint(
                condition=models.Q(("owner__isnull", True)),
                fields=("name",),
                name="documents_storagepath_name_uniq",
            ),
        ),
        migrations.AddConstraint(
            model_name="tag",
            constraint=models.UniqueConstraint(
                fields=("name", "owner"),
                name="documents_tag_unique_name_owner",
            ),
        ),
        migrations.AddConstraint(
            model_name="tag",
            constraint=models.UniqueConstraint(
                condition=models.Q(("owner__isnull", True)),
                fields=("name",),
                name="documents_tag_name_uniq",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                    (22, "has tags in"),
                    (23, "ASN greater than"),
                    (24, "ASN less than"),
                    (25, "storage path is"),
                    (26, "has correspondent in"),
                    (27, "does not have correspondent in"),
                    (28, "has document type in"),
                    (29, "does not have document type in"),
                    (30, "has storage path in"),
                    (31, "does not have storage path in"),
                ],
                verbose_name="rule type",
            ),
        ),
        migrations.RenameModel(
            old_name="Comment",
            new_name="Note",
        ),
        migrations.RenameField(
            model_name="note",
            old_name="comment",
            new_name="note",
        ),
        migrations.AlterModelOptions(
            name="note",
            options={
                "ordering": ("created",),
                "verbose_name": "note",
                "verbose_name_plural": "notes",
            },
        ),
        migrations.AlterField(
            model_name="note",
            name="document",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="notes",
                to="documents.document",
                verbose_name="document",
            ),
        ),
        migrations.AlterField(
            model_name="note",
            name="note",
            field=models.TextField(
                blank=True,
                help_text="Note for the document",
                verbose_name="content",
            ),
        ),
        migrations.AlterField(
            model_name="note",
            name="user",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="notes",
                to=settings.AUTH_USER_MODEL,
                verbose_name="user",
            ),
        ),
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                    (22, "has tags in"),
                    (23, "ASN greater than"),
                    (24, "ASN less than"),
                    (25, "storage path is"),
                    (26, "has correspondent in"),
                    (27, "does not have correspondent in"),
                    (28, "has document type in"),
                    (29, "does not have document type in"),
                    (30, "has storage path in"),
                    (31, "does not have storage path in"),
                    (32, "owner is"),
                    (33, "has owner in"),
                    (34, "does not have owner"),
                    (35, "does not have owner in"),
                ],
                verbose_name="rule type",
            ),
        ),
    ]
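The paired UniqueConstraints above exist because SQL treats NULLs as distinct in unique indexes: UniqueConstraint(fields=("name", "owner")) alone would allow any number of unowned objects with the same name. The second, conditional constraint closes that gap by enforcing name uniqueness on the owner-is-NULL subset. In model form the pattern looks like this (model and constraint names are illustrative):

from django.conf import settings
from django.db import models

class Example(models.Model):
    # Illustrative model showing the two-constraint pattern used above.
    name = models.CharField(max_length=128)
    owner = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
    )

    class Meta:
        constraints = [
            # Unique per owner; rows with owner = NULL are NOT caught here.
            models.UniqueConstraint(
                fields=("name", "owner"),
                name="example_unique_name_owner",
            ),
            # Separately enforce uniqueness among unowned rows.
            models.UniqueConstraint(
                fields=("name",),
                condition=models.Q(owner__isnull=True),
                name="example_name_uniq",
            ),
        ]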
70
src/documents/migrations/1023_add_comments.py
Normal file
@@ -0,0 +1,70 @@
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1022_paperlesstask"),
    ]

    operations = [
        migrations.CreateModel(
            name="Comment",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "comment",
                    models.TextField(
                        blank=True,
                        help_text="Comment for the document",
                        verbose_name="content",
                    ),
                ),
                (
                    "created",
                    models.DateTimeField(
                        db_index=True,
                        default=django.utils.timezone.now,
                        verbose_name="created",
                    ),
                ),
                (
                    "document",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="documents",
                        to="documents.document",
                        verbose_name="document",
                    ),
                ),
                (
                    "user",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="users",
                        to=settings.AUTH_USER_MODEL,
                        verbose_name="user",
                    ),
                ),
            ],
            options={
                "verbose_name": "comment",
                "verbose_name_plural": "comments",
                "ordering": ("created",),
            },
        ),
    ]
25
src/documents/migrations/1024_document_original_filename.py
Normal file
@@ -0,0 +1,25 @@
# Generated by Django 4.0.6 on 2022-07-25 06:34

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1023_add_comments"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="original_filename",
            field=models.CharField(
                default=None,
                editable=False,
                help_text="The original name of the file when it was uploaded",
                max_length=1024,
                null=True,
                verbose_name="original filename",
            ),
        ),
    ]
@@ -0,0 +1,48 @@
# Generated by Django 4.0.5 on 2022-08-26 16:49

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1024_document_original_filename"),
    ]

    operations = [
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                    (22, "has tags in"),
                    (23, "ASN greater than"),
                    (24, "ASN less than"),
                    (25, "storage path is"),
                ],
                verbose_name="rule type",
            ),
        ),
    ]
60
src/documents/migrations/1026_transition_to_celery.py
Normal file
@@ -0,0 +1,60 @@
# Generated by Django 4.1.1 on 2022-09-27 19:31

import django.db.models.deletion
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("django_celery_results", "0011_taskresult_periodic_task_name"),
        ("documents", "1025_alter_savedviewfilterrule_rule_type"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="paperlesstask",
            name="created",
        ),
        migrations.RemoveField(
            model_name="paperlesstask",
            name="name",
        ),
        migrations.RemoveField(
            model_name="paperlesstask",
            name="started",
        ),
        # Remove the field from the model
        migrations.RemoveField(
            model_name="paperlesstask",
            name="attempted_task",
        ),
        # Add the field back, pointing to the correct model
        # This resolves a problem where the temporary change in 1022
        # results in a type mismatch
        migrations.AddField(
            model_name="paperlesstask",
            name="attempted_task",
            field=models.OneToOneField(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="attempted_task",
                to="django_celery_results.taskresult",
            ),
        ),
        # Drop the django-q tables entirely
        # Must be done last or there could be references here
        migrations.RunSQL(
            "DROP TABLE IF EXISTS django_q_ormq",
            reverse_sql=migrations.RunSQL.noop,
        ),
        migrations.RunSQL(
            "DROP TABLE IF EXISTS django_q_schedule",
            reverse_sql=migrations.RunSQL.noop,
        ),
        migrations.RunSQL(
            "DROP TABLE IF EXISTS django_q_task",
            reverse_sql=migrations.RunSQL.noop,
        ),
    ]
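The three RunSQL drops above use migrations.RunSQL.noop as their reverse, so rolling this migration back will not recreate the django-q tables; the transition to Celery is effectively one-way. For contrast, a reversible RunSQL carries explicit SQL in both directions (the table and DDL here are illustrative only):

from django.db import migrations

# Sketch of a reversible RunSQL operation.
reversible_example = migrations.RunSQL(
    sql="DROP TABLE IF EXISTS example_scratch",
    reverse_sql="CREATE TABLE example_scratch (id integer PRIMARY KEY)",
)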
@@ -0,0 +1,134 @@
# Generated by Django 4.1.2 on 2022-10-17 16:31

import django.utils.timezone
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1026_transition_to_celery"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="paperlesstask",
            name="attempted_task",
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="date_created",
            field=models.DateTimeField(
                default=django.utils.timezone.now,
                help_text="Datetime field when the task result was created in UTC",
                null=True,
                verbose_name="Created DateTime",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="date_done",
            field=models.DateTimeField(
                default=None,
                help_text="Datetime field when the task was completed in UTC",
                null=True,
                verbose_name="Completed DateTime",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="date_started",
            field=models.DateTimeField(
                default=None,
                help_text="Datetime field when the task was started in UTC",
                null=True,
                verbose_name="Started DateTime",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="result",
            field=models.TextField(
                default=None,
                help_text="The data returned by the task",
                null=True,
                verbose_name="Result Data",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="status",
            field=models.CharField(
                choices=[
                    ("FAILURE", "FAILURE"),
                    ("PENDING", "PENDING"),
                    ("RECEIVED", "RECEIVED"),
                    ("RETRY", "RETRY"),
                    ("REVOKED", "REVOKED"),
                    ("STARTED", "STARTED"),
                    ("SUCCESS", "SUCCESS"),
                ],
                default="PENDING",
                help_text="Current state of the task being run",
                max_length=30,
                verbose_name="Task State",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="task_args",
            field=models.JSONField(
                help_text="JSON representation of the positional arguments used with the task",
                null=True,
                verbose_name="Task Positional Arguments",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="task_file_name",
            field=models.CharField(
                help_text="Name of the file which the Task was run for",
                max_length=255,
                null=True,
                verbose_name="Task Name",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="task_kwargs",
            field=models.JSONField(
                help_text="JSON representation of the named arguments used with the task",
                null=True,
                verbose_name="Task Named Arguments",
            ),
        ),
        migrations.AddField(
            model_name="paperlesstask",
            name="task_name",
            field=models.CharField(
                help_text="Name of the Task which was run",
                max_length=255,
                null=True,
                verbose_name="Task Name",
            ),
        ),
        migrations.AlterField(
            model_name="paperlesstask",
            name="acknowledged",
            field=models.BooleanField(
                default=False,
                help_text="If the task is acknowledged via the frontend or API",
                verbose_name="Acknowledged",
            ),
        ),
        migrations.AlterField(
            model_name="paperlesstask",
            name="task_id",
            field=models.CharField(
                help_text="Celery ID for the Task that was run",
                max_length=255,
                unique=True,
                verbose_name="Task ID",
            ),
        ),
    ]
@@ -0,0 +1,20 @@
# Generated by Django 4.1.3 on 2022-11-22 17:50

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1027_remove_paperlesstask_attempted_task_and_more"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="paperlesstask",
            name="task_args",
        ),
        migrations.RemoveField(
            model_name="paperlesstask",
            name="task_kwargs",
        ),
    ]
@@ -0,0 +1,30 @@
# Generated by Django 4.1.4 on 2023-01-24 17:56

import django.core.validators
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1028_remove_paperlesstask_task_args_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="archive_serial_number",
            field=models.PositiveIntegerField(
                blank=True,
                db_index=True,
                help_text="The position of this document in your physical document archive.",
                null=True,
                unique=True,
                validators=[
                    django.core.validators.MaxValueValidator(4294967295),
                    django.core.validators.MinValueValidator(0),
                ],
                verbose_name="archive serial number",
            ),
        ),
    ]
@@ -0,0 +1,23 @@
# Generated by Django 4.1.5 on 2023-02-03 21:53

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1029_alter_document_archive_serial_number"),
    ]

    operations = [
        migrations.AlterField(
            model_name="paperlesstask",
            name="task_file_name",
            field=models.CharField(
                help_text="Name of the file which the Task was run for",
                max_length=255,
                null=True,
                verbose_name="Task Filename",
            ),
        ),
    ]
Some files were not shown because too many files have changed in this diff.