diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a40ddb8..6f11e3bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -## 0.0.80-dev0 +## 0.0.80 -* Add `include_slide_notes` parameter, indicating whether slide notes in `ppt` and `pptx` files should be partitioned. Default is `True`. Now, when slide notes are present in the file, they will be included alongside other elements, which may shift the index numbers of non-note elements. +* Bump to `unstructured` 0.15.10 +* Add `include_slide_notes` parameter, indicating whether slide notes in `ppt` and `pptx` files should be partitioned. Default is `True`. Now, when slide notes are present in the file, they will be included alongside other elements, which may shift the index numbers of non-note elements. ## 0.0.79 diff --git a/requirements/base.in b/requirements/base.in index 37be2785..7848c297 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -1,10 +1,14 @@ -c constraints.in -unstructured[all-docs]>=0.8.1 +unstructured[all-docs] # Pinning click due to a unicode issue in black # can remove after black drops support for Python 3.6 # ref: https://github.com/psf/black/issues/2964 click==8.1.3 -fastapi +# NOTE(robinson) - fastapi>=0.114.0 causes the test listed below to fail, though it +# works if chunking strategy and new_after_n_chars are explicitly set. 
Pinning +# for now to preserve behavior +# test_parallel_mode_preserves_uniqueness_of_hashes_when_asssembling_page_splits +fastapi<0.114.0 uvicorn ratelimit requests diff --git a/requirements/base.txt b/requirements/base.txt index d07ecd9b..089c02ff 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -20,13 +20,13 @@ beautifulsoup4==4.12.3 # via unstructured cachetools==5.5.0 # via google-auth -certifi==2024.7.4 +certifi==2024.8.30 # via # httpcore # httpx # requests # unstructured-client -cffi==1.17.0 +cffi==1.17.1 # via cryptography chardet==5.2.0 # via unstructured @@ -43,9 +43,9 @@ click==8.1.3 # uvicorn coloredlogs==15.0.1 # via onnxruntime -contourpy==1.2.1 +contourpy==1.3.0 # via matplotlib -cryptography==43.0.0 +cryptography==43.0.1 # via pdfminer-six cycler==0.12.1 # via matplotlib @@ -53,7 +53,7 @@ dataclasses-json==0.6.7 # via # unstructured # unstructured-client -deepdiff==7.0.1 +deepdiff==8.0.1 # via unstructured-client deprecated==1.2.14 # via pikepdf @@ -65,9 +65,9 @@ et-xmlfile==1.1.0 # via openpyxl exceptiongroup==1.2.2 # via anyio -fastapi==0.112.1 +fastapi==0.113.0 # via -r requirements/base.in -filelock==3.15.4 +filelock==3.16.0 # via # huggingface-hub # torch @@ -78,11 +78,11 @@ flatbuffers==24.3.25 # via onnxruntime fonttools==4.53.1 # via matplotlib -fsspec==2024.6.1 +fsspec==2024.9.0 # via # huggingface-hub # torch -google-api-core[grpc]==2.19.1 +google-api-core[grpc]==2.19.2 # via google-cloud-vision google-auth==2.34.0 # via @@ -90,15 +90,15 @@ google-auth==2.34.0 # google-cloud-vision google-cloud-vision==3.7.4 # via unstructured -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.65.0 # via # google-api-core # grpcio-status -grpcio==1.65.5 +grpcio==1.66.1 # via # google-api-core # grpcio-status -grpcio-status==1.65.5 +grpcio-status==1.66.1 # via google-api-core h11==0.14.0 # via @@ -106,7 +106,7 @@ h11==0.14.0 # uvicorn httpcore==1.0.5 # via httpx -httpx==0.27.0 +httpx==0.27.2 # via unstructured-client 
huggingface-hub==0.24.6 # via @@ -116,7 +116,7 @@ huggingface-hub==0.24.6 # unstructured-inference humanfriendly==10.0 # via coloredlogs -idna==3.7 +idna==3.8 # via # anyio # httpx @@ -130,7 +130,7 @@ joblib==1.4.2 # via nltk jsonpath-python==1.0.6 # via unstructured-client -kiwisolver==1.4.5 +kiwisolver==1.4.7 # via matplotlib langdetect==1.0.9 # via unstructured @@ -191,7 +191,7 @@ onnx==1.16.2 # via # unstructured # unstructured-inference -onnxruntime==1.19.0 +onnxruntime==1.19.2 # via unstructured-inference opencv-python==4.10.0.84 # via @@ -199,7 +199,7 @@ opencv-python==4.10.0.84 # unstructured-inference openpyxl==3.1.5 # via unstructured -ordered-set==4.1.0 +orderly-set==5.2.2 # via deepdiff packaging==24.1 # via @@ -225,7 +225,9 @@ pdfminer-six==20231228 # unstructured pdfplumber==0.11.4 # via layoutparser -pikepdf==9.1.1 +pi-heif==0.18.0 + # via unstructured +pikepdf==9.2.1 # via unstructured pillow==10.4.0 # via @@ -233,20 +235,18 @@ pillow==10.4.0 # matplotlib # pdf2image # pdfplumber + # pi-heif # pikepdf - # pillow-heif # python-pptx # torchvision # unstructured-pytesseract -pillow-heif==0.18.0 - # via unstructured portalocker==2.10.1 # via iopath proto-plus==1.24.0 # via # google-api-core # google-cloud-vision -protobuf==5.27.3 +protobuf==5.28.0 # via # google-api-core # google-cloud-vision @@ -271,13 +271,13 @@ pycparser==2.22 # via cffi pycryptodome==3.20.0 # via -r requirements/base.in -pydantic==2.8.2 +pydantic==2.9.1 # via fastapi -pydantic-core==2.20.1 +pydantic-core==2.23.3 # via pydantic pypandoc==1.13 # via unstructured -pyparsing==3.1.2 +pyparsing==3.1.4 # via matplotlib pypdf==4.3.1 # via @@ -312,7 +312,7 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.9.6 +rapidfuzz==3.9.7 # via # unstructured # unstructured-inference @@ -335,11 +335,11 @@ requests-toolbelt==1.0.0 # via unstructured-client rsa==4.9 # via google-auth -safetensors==0.4.4 +safetensors==0.4.5 # via # timm # transformers -scipy==1.14.0 +scipy==1.14.1 # via 
layoutparser six==1.16.0 # via @@ -352,7 +352,7 @@ sniffio==1.3.1 # httpx soupsieve==2.6 # via beautifulsoup4 -starlette==0.38.2 +starlette==0.38.5 # via fastapi sympy==1.13.2 # via @@ -360,19 +360,19 @@ sympy==1.13.2 # torch tabulate==0.9.0 # via unstructured -timm==1.0.8 +timm==1.0.9 # via # effdet # unstructured-inference tokenizers==0.19.1 # via transformers -torch==2.4.0 +torch==2.4.1 # via # effdet # timm # torchvision # unstructured-inference -torchvision==0.19.0 +torchvision==0.19.1 # via # effdet # timm @@ -383,7 +383,7 @@ tqdm==4.66.5 # nltk # transformers # unstructured -transformers==4.44.1 +transformers==4.44.2 # via unstructured-inference typing-extensions==4.12.2 # via @@ -409,9 +409,9 @@ typing-inspect==0.9.0 # unstructured-client tzdata==2024.1 # via pandas -unstructured[all-docs]==0.15.7 +unstructured[all-docs]==0.15.10 # via -r requirements/base.in -unstructured-client==0.25.5 +unstructured-client==0.25.8 # via unstructured unstructured-inference==0.7.36 # via unstructured diff --git a/requirements/test.txt b/requirements/test.txt index 978b89d0..fff96f98 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -57,14 +57,14 @@ cachetools==5.5.0 # via # -r requirements/base.txt # google-auth -certifi==2024.7.4 +certifi==2024.8.30 # via # -r requirements/base.txt # httpcore # httpx # requests # unstructured-client -cffi==1.17.0 +cffi==1.17.1 # via # -r requirements/base.txt # argon2-cffi-bindings @@ -95,13 +95,13 @@ comm==0.2.2 # via # ipykernel # ipywidgets -contourpy==1.2.1 +contourpy==1.3.0 # via # -r requirements/base.txt # matplotlib coverage[toml]==7.6.1 # via pytest-cov -cryptography==43.0.0 +cryptography==43.0.1 # via # -r requirements/base.txt # pdfminer-six @@ -118,7 +118,7 @@ debugpy==1.8.5 # via ipykernel decorator==5.1.1 # via ipython -deepdiff==7.0.1 +deepdiff==8.0.1 # via # -r requirements/base.txt # -r requirements/test.in @@ -149,18 +149,18 @@ exceptiongroup==1.2.2 # pytest execnb==0.1.6 # via nbdev -executing==2.0.1 
+executing==2.1.0 # via stack-data -fastapi==0.112.1 +fastapi==0.113.0 # via -r requirements/base.txt -fastcore==1.7.1 +fastcore==1.7.5 # via # execnb # ghapi # nbdev fastjsonschema==2.20.0 # via nbformat -filelock==3.15.4 +filelock==3.16.0 # via # -r requirements/base.txt # huggingface-hub @@ -182,14 +182,14 @@ fonttools==4.53.1 # matplotlib fqdn==1.5.1 # via jsonschema -fsspec==2024.6.1 +fsspec==2024.9.0 # via # -r requirements/base.txt # huggingface-hub # torch -ghapi==1.0.5 +ghapi==1.0.6 # via nbdev -google-api-core[grpc]==2.19.1 +google-api-core[grpc]==2.19.2 # via # -r requirements/base.txt # google-cloud-vision @@ -202,17 +202,17 @@ google-cloud-vision==3.7.4 # via # -r requirements/base.txt # unstructured -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.65.0 # via # -r requirements/base.txt # google-api-core # grpcio-status -grpcio==1.65.5 +grpcio==1.66.1 # via # -r requirements/base.txt # google-api-core # grpcio-status -grpcio-status==1.65.5 +grpcio-status==1.66.1 # via # -r requirements/base.txt # google-api-core @@ -225,7 +225,7 @@ httpcore==1.0.5 # via # -r requirements/base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -r requirements/base.txt # -r requirements/test.in @@ -242,7 +242,7 @@ humanfriendly==10.0 # via # -r requirements/base.txt # coloredlogs -idna==3.7 +idna==3.8 # via # -r requirements/base.txt # anyio @@ -261,14 +261,13 @@ ipykernel==6.29.5 # jupyter # jupyter-console # jupyterlab - # qtconsole -ipython==8.26.0 +ipython==8.27.0 # via # execnb # ipykernel # ipywidgets # jupyter-console -ipywidgets==8.1.3 +ipywidgets==8.1.5 # via jupyter isoduration==20.11.0 # via jsonschema @@ -301,7 +300,7 @@ jsonschema[format-nongpl]==4.23.0 # nbformat jsonschema-specifications==2023.12.1 # via jsonschema -jupyter==1.0.0 +jupyter==1.1.1 # via -r requirements/test.in jupyter-client==8.6.2 # via @@ -309,7 +308,6 @@ jupyter-client==8.6.2 # jupyter-console # jupyter-server # nbclient - # qtconsole jupyter-console==6.6.3 # via jupyter 
jupyter-core==5.7.2 @@ -322,7 +320,6 @@ jupyter-core==5.7.2 # nbclient # nbconvert # nbformat - # qtconsole jupyter-events==0.10.0 # via jupyter-server jupyter-lsp==2.2.5 @@ -336,17 +333,19 @@ jupyter-server==2.14.2 # notebook-shim jupyter-server-terminals==0.5.3 # via jupyter-server -jupyterlab==4.2.4 - # via notebook +jupyterlab==4.2.5 + # via + # jupyter + # notebook jupyterlab-pygments==0.3.0 # via nbconvert jupyterlab-server==2.27.3 # via # jupyterlab # notebook -jupyterlab-widgets==3.0.11 +jupyterlab-widgets==3.0.13 # via ipywidgets -kiwisolver==1.4.5 +kiwisolver==1.4.7 # via # -r requirements/base.txt # matplotlib @@ -396,7 +395,7 @@ mpmath==1.3.0 # via # -r requirements/base.txt # sympy -mypy==1.11.1 +mypy==1.11.2 # via -r requirements/test.in mypy-extensions==1.0.0 # via @@ -411,7 +410,7 @@ nbconvert==7.16.4 # via # jupyter # jupyter-server -nbdev==2.3.27 +nbdev==2.3.29 # via -r requirements/test.in nbformat==5.10.4 # via @@ -432,7 +431,7 @@ nltk==3.9.1 # via # -r requirements/base.txt # unstructured -notebook==7.2.1 +notebook==7.2.2 # via jupyter notebook-shim==0.2.4 # via @@ -467,7 +466,7 @@ onnx==1.16.2 # -r requirements/base.txt # unstructured # unstructured-inference -onnxruntime==1.19.0 +onnxruntime==1.19.2 # via # -r requirements/base.txt # unstructured-inference @@ -480,7 +479,7 @@ openpyxl==3.1.5 # via # -r requirements/base.txt # unstructured -ordered-set==4.1.0 +orderly-set==5.2.2 # via # -r requirements/base.txt # deepdiff @@ -504,8 +503,6 @@ packaging==24.1 # onnxruntime # pikepdf # pytest - # qtconsole - # qtpy # transformers # unstructured-client # unstructured-pytesseract @@ -536,7 +533,11 @@ pdfplumber==0.11.4 # layoutparser pexpect==4.9.0 # via ipython -pikepdf==9.1.1 +pi-heif==0.18.0 + # via + # -r requirements/base.txt + # unstructured +pikepdf==9.2.1 # via # -r requirements/base.txt # unstructured @@ -547,16 +548,12 @@ pillow==10.4.0 # matplotlib # pdf2image # pdfplumber + # pi-heif # pikepdf - # pillow-heif # python-pptx # torchvision 
# unstructured-pytesseract -pillow-heif==0.18.0 - # via - # -r requirements/base.txt - # unstructured -platformdirs==4.2.2 +platformdirs==4.3.2 # via # black # jupyter-core @@ -577,7 +574,7 @@ proto-plus==1.24.0 # -r requirements/base.txt # google-api-core # google-cloud-vision -protobuf==5.27.3 +protobuf==5.28.0 # via # -r requirements/base.txt # google-api-core @@ -619,11 +616,11 @@ pycparser==2.22 # cffi pycryptodome==3.20.0 # via -r requirements/base.txt -pydantic==2.8.2 +pydantic==2.9.1 # via # -r requirements/base.txt # fastapi -pydantic-core==2.20.1 +pydantic-core==2.23.3 # via # -r requirements/base.txt # pydantic @@ -634,12 +631,11 @@ pygments==2.18.0 # ipython # jupyter-console # nbconvert - # qtconsole pypandoc==1.13 # via # -r requirements/base.txt # unstructured -pyparsing==3.1.2 +pyparsing==3.1.4 # via # -r requirements/base.txt # matplotlib @@ -652,7 +648,7 @@ pypdfium2==4.30.0 # via # -r requirements/base.txt # pdfplumber -pytest==8.3.2 +pytest==8.3.3 # via # pytest-cov # pytest-mock @@ -708,18 +704,13 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -pyzmq==26.1.1 +pyzmq==26.2.0 # via # ipykernel # jupyter-client # jupyter-console # jupyter-server - # qtconsole -qtconsole==5.5.2 - # via jupyter -qtpy==2.4.1 - # via qtconsole -rapidfuzz==3.9.6 +rapidfuzz==3.9.7 # via # -r requirements/base.txt # unstructured @@ -766,12 +757,12 @@ rsa==4.9 # via # -r requirements/base.txt # google-auth -safetensors==0.4.4 +safetensors==0.4.5 # via # -r requirements/base.txt # timm # transformers -scipy==1.14.0 +scipy==1.14.1 # via # -r requirements/base.txt # layoutparser @@ -798,7 +789,7 @@ soupsieve==2.6 # beautifulsoup4 stack-data==0.6.3 # via ipython -starlette==0.38.2 +starlette==0.38.5 # via # -r requirements/base.txt # fastapi @@ -815,7 +806,7 @@ terminado==0.18.1 # via # jupyter-server # jupyter-server-terminals -timm==1.0.8 +timm==1.0.9 # via # -r requirements/base.txt # effdet @@ -833,14 +824,14 @@ tomli==2.0.1 # jupyterlab # mypy # pytest -torch==2.4.0 
+torch==2.4.1 # via # -r requirements/base.txt # effdet # timm # torchvision # unstructured-inference -torchvision==0.19.0 +torchvision==0.19.1 # via # -r requirements/base.txt # effdet @@ -877,12 +868,11 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat - # qtconsole -transformers==4.44.1 +transformers==4.44.2 # via # -r requirements/base.txt # unstructured-inference -types-python-dateutil==2.9.0.20240316 +types-python-dateutil==2.9.0.20240906 # via arrow typing-extensions==4.12.2 # via @@ -916,9 +906,9 @@ tzdata==2024.1 # via # -r requirements/base.txt # pandas -unstructured[all-docs]==0.15.7 +unstructured[all-docs]==0.15.10 # via -r requirements/base.txt -unstructured-client==0.25.5 +unstructured-client==0.25.8 # via # -r requirements/base.txt # unstructured @@ -939,7 +929,7 @@ urllib3==2.2.2 # unstructured-client uvicorn==0.30.6 # via -r requirements/base.txt -watchdog==4.0.2 +watchdog==5.0.2 # via nbdev wcwidth==0.2.13 # via prompt-toolkit @@ -953,7 +943,7 @@ websocket-client==1.8.0 # via jupyter-server wheel==0.44.0 # via astunparse -widgetsnbextension==4.0.11 +widgetsnbextension==4.0.13 # via ipywidgets wrapt==1.16.0 # via @@ -970,5 +960,4 @@ xlsxwriter==3.2.0 # python-pptx # The following packages are considered to be unsafe in a requirements file: -# pip # setuptools