Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
9389af5
add output path to datalad run
dnl-bsch Nov 26, 2024
7ecf7cd
first try dowload in ci
dnl-bsch Nov 26, 2024
f840193
ssh keys for gin
dnl-bsch Nov 26, 2024
d9658ff
git clone
dnl-bsch Nov 26, 2024
f9bf164
checkout repo
dnl-bsch Nov 26, 2024
599c5f6
check out repo
dnl-bsch Nov 26, 2024
1742527
clone from gin
dnl-bsch Nov 27, 2024
a26e861
clone with ssh datalad
dnl-bsch Nov 27, 2024
e38540f
clone with ssh datalad
dnl-bsch Nov 27, 2024
1982d87
reconfigure siblings
dnl-bsch Nov 27, 2024
79c2397
change more datalad settings
dnl-bsch Nov 27, 2024
1bdc584
more error messages
dnl-bsch Nov 27, 2024
2c5d18a
try again with ssh access
dnl-bsch Nov 27, 2024
e86b788
ssh
dnl-bsch Nov 27, 2024
77b156d
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
5b82a64
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
328636d
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
0434541
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
03c1c24
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
7af3983
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
665839b
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
27d01e4
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
0c739a9
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
bf790fb
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
09051bb
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
7efe0fa
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
9d47549
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
56c1231
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
67eec1f
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
97e01e4
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
c024cfc
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
9c1e119
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
dcb5a48
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
43d5560
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
3beba64
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
61f1860
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
cd6c263
[DATALAD RUNCMD] poetry run python3 scripts/download_all_...
dnl-bsch Nov 27, 2024
df5f4d1
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
5ad4a97
[DATALAD] Recorded changes
dnl-bsch Nov 27, 2024
c242808
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
35730ad
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
f85c166
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
116bc7e
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
58661bf
Merge branch 'main' into fix-datalad-set-up
dnl-bsch Nov 28, 2024
aa41e26
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
8c5fc1a
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
17b6dff
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
12889fd
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
79b0d70
[DATALAD] Recorded changes
dnl-bsch Nov 28, 2024
da8dad1
Merge branch 'main' into fix-datalad-set-up
JGuetschow Jan 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: CI

on:
pull_request:
# pull_request:
push:
branches: [main]
tags: ['v*']
Expand Down
53 changes: 53 additions & 0 deletions .github/workflows/download.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Workflow that clones the data repository with datalad, installs the project
# with poetry and runs the download script under `datalad run`.
name: download

# Triggered by pull requests and by pushes to main or to tags matching v*.
on:
pull_request:
push:
branches: [main]
tags: ['v*']

jobs:
download:
runs-on: ubuntu-22.04
steps:
# Install the datalad apt package through the apt package caching action.
- name: install datalad
uses: awalsh128/cache-apt-pkgs-action@v1.4.2
with:
packages: datalad
version: 1.0
- name: Install poetry
shell: bash
run: |
pipx install poetry
which poetry
poetry --version # Check poetry installation
# A git identity is configured globally before datalad is used.
# NOTE(review): presumably needed because `datalad run` records a commit - confirm.
- name: configure git email
run: git config --global user.email "daniel.busch@climate-resource.com"
- name: configure git user
run: git config --global user.name "Daniel Busch (via github actions)"
# The repository is cloned with datalad (there is no actions/checkout step),
# so the later steps cd into the FAOSTAT_data_primap/ clone first.
- name: clone repo
run: datalad clone https://github.com/primap-community/FAOSTAT_data_primap.git
- name: setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
# Create an in-project virtualenv and install the main and dev dependency groups.
- name: Set Poetry environment
shell: bash
run: |
cd FAOSTAT_data_primap/
poetry config virtualenvs.create true
poetry config virtualenvs.in-project true
poetry install --no-interaction --all-extras --only 'main,dev'
poetry run python --version # Check python version just in case
# Run the download script through `datalad run`; each -o option declares the
# output path pattern of one domain.
# NOTE(review): the release date 2024-11-14 is hard-coded in every output
# pattern - confirm it is updated when a new release is downloaded.
- name: download data
run: |
cd FAOSTAT_data_primap/
datalad run \
-o 'downloaded_data/farm_gate_emissions_crops/2024-11-14/*' \
-o 'downloaded_data/farm_gate_agriculture_energy/2024-11-14/*' \
-o 'downloaded_data/farm_gate_livestock/2024-11-14/*' \
-o 'downloaded_data/land_use_drained_organic_soils/2024-11-14/*' \
-o 'downloaded_data/land_use_fires/2024-11-14/*' \
-o 'downloaded_data/land_use_forests/2024-11-14/*' \
-o 'downloaded_data/pre_post_agricultural_production/2024-11-14/*' \
poetry run python3 scripts/download_all_domains.py
117 changes: 113 additions & 4 deletions src/faostat_data_primap/download.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Downloads data from FAOSTAT website."""

import hashlib
import os
import pathlib
import time
Expand All @@ -17,6 +18,78 @@
from faostat_data_primap.helper.paths import downloaded_data_path


def find_previous_release_path(
current_release_path: pathlib.Path,
) -> pathlib.Path | None:
"""
Find the most recent previous release directory within same domain

Release directories are assumed to be subdirectories within the same parent
directory as `current_release_path`. The Sorting is done alphabetically,
so directory names should follow the naming convention YYYY-MM-DD

Parameters
----------
current_release_path : pathlib.Path
The path of the current release directory.

Returns
-------
pathlib.Path or None
Returns the path of the most recent previous release directory if one exists,
otherwise returns None.
"""
domain_path = current_release_path.parent
all_releases = [
release_name
for release_name in os.listdir(current_release_path.parent)
if (domain_path / release_name).is_dir()
]

# make sure all directories follow the naming convention
try:
all_releases_datetime = [
datetime.strptime(release, "%Y-%m-%d") for release in all_releases
]
except ValueError as e:
msg = (
"All release folders must be in YYYY-MM-DD format, "
f"got {sorted(all_releases)}"
)
raise ValueError(msg) from e

all_releases_datetime = sorted(all_releases_datetime)
current_release_datetime = datetime.strptime(current_release_path.name, "%Y-%m-%d")
index = all_releases_datetime.index(current_release_datetime)

# if the current release is the latest or the only one
if index == 0:
return None

return domain_path / all_releases_datetime[index - 1].strftime("%Y-%m-%d")


def calculate_checksum(file_path: pathlib.Path) -> str:
    """
    Compute the SHA-256 digest of a file's contents.

    The file is read in 4096-byte blocks so that it never has to be held in
    memory as a whole.

    Parameters
    ----------
    file_path : pathlib.Path
        Location of the file to hash.

    Returns
    -------
    str
        Hexadecimal representation of the file's SHA-256 digest.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # read() returns b"" at end of file, which ends the loop
        while block := stream.read(4096):
            digest.update(block)
    return digest.hexdigest()


def download_methodology(url_download: str, save_path: pathlib.Path) -> None:
"""
Download methodology file.
Expand All @@ -41,17 +114,53 @@ def download_methodology(url_download: str, save_path: pathlib.Path) -> None:
filename = url_download.split("/")[-1]
download_path = save_path / filename

if not save_path.exists():
save_path.mkdir()

if download_path.exists():
if download_path.is_symlink():
os.remove(download_path)
else:
print(f"Skipping download of {download_path} because it already exists.")
return

response = requests.get(url_download, stream=True, timeout=30)
response.raise_for_status()
with open(download_path, "wb") as f:
f.write(response.content)
previous_release = find_previous_release_path(save_path)
# Attempt to find a file to compare in the previous release
if previous_release:
file_to_compare = previous_release / filename
if file_to_compare.exists():
response = requests.get(url_download, stream=True, timeout=30)
response.raise_for_status()
file_to_download_checksum = hashlib.sha256(response.content).hexdigest()
file_to_compare_checksum = calculate_checksum(file_to_compare)

if file_to_download_checksum == file_to_compare_checksum:
print(
f"File '{filename}' is identical in the previous release. "
f"Creating symlink."
)
os.symlink(file_to_compare, download_path)
return
else:
print(
f"File '{filename}' differs from previous release. "
f"Downloading file."
)
else:
print(f"File '{filename}' not found in previous release. Downloading file.")
response = requests.get(url_download, stream=True, timeout=30)
response.raise_for_status()

# Save downloaded file to current release
with open(download_path, "wb") as f:
f.write(response.content)

else:
print(f"No previous release found. Downloading file '{filename}'.")
response = requests.get(url_download, stream=True, timeout=30)
response.raise_for_status()
with open(download_path, "wb") as f:
f.write(response.content)


def get_html_content(url: str) -> BeautifulSoup:
Expand Down
Loading