curl-curl/scripts/wcurl

#!/bin/sh

# wcurl - a simple wrapper around curl to easily download files.
#
# Requires curl >= 7.46.0 (2015)
#
# Copyright (C) Samuel Henrique <samueloph@debian.org>, Sergio Durigan
# Junior <sergiodj@debian.org> and many contributors, see the AUTHORS
# file.
#
# Permission to use, copy, modify, and distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright
# notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the name of a copyright holder shall not be
# used in advertising or otherwise to promote the sale, use or other dealings in
# this Software without prior written authorization of the copyright holder.
#
# SPDX-License-Identifier: curl

# Stop on errors and on usage of unset variables.
set -eu

VERSION="2025.04.20"

PROGRAM_NAME="$(basename "$0")"
readonly PROGRAM_NAME

# Display the version.
print_version()
{
    cat << _EOF_
${VERSION}
_EOF_
}

# Display the program usage.
usage()
{
    cat << _EOF_
${PROGRAM_NAME} -- a simple wrapper around curl to easily download files.

Usage: ${PROGRAM_NAME} <URL>...
       ${PROGRAM_NAME} [--curl-options <CURL_OPTIONS>]... [--no-decode-filename] [-o|-O|--output <PATH>] [--dry-run] [--] <URL>...
       ${PROGRAM_NAME} [--curl-options=<CURL_OPTIONS>]... [--no-decode-filename] [--output=<PATH>] [--dry-run] [--] <URL>...
       ${PROGRAM_NAME} -h|--help
       ${PROGRAM_NAME} -V|--version

Options:

  --curl-options <CURL_OPTIONS>: Specify extra options to be passed when invoking curl. May be
                                 specified more than once.

  -o, -O, --output <PATH>: Use the provided output path instead of getting it from the URL. If
                           multiple URLs are provided, resulting files share the same name with a
                           number appended to the end (curl >= 7.83.0). If this option is provided
                           multiple times, only the last value is considered.

  --no-decode-filename: Don't percent-decode the output filename, even if the percent-encoding in
                        the URL was done by wcurl, e.g.: The URL contained whitespaces.

  --dry-run: Don't actually execute curl, just print what would be invoked.

  -V, --version: Print version information.

  -h, --help: Print this usage message.

  <CURL_OPTIONS>: Any option supported by curl can be set here. This is not used by wcurl; it is
                 instead forwarded to the curl invocation.

  <URL>: URL to be downloaded. Anything that is not a parameter is considered
         an URL. Whitespaces are percent-encoded and the URL is passed to curl, which
         then performs the parsing. May be specified more than once.
_EOF_
}

# Display an error message and bail out.
error()
{
    printf "%s\n" "$*" > /dev/stderr
    exit 1
}

# Extra curl options provided by the user.
# This is set per-URL for every URL provided.
# Some options are global, but we are erroring on the side of needlesly setting
# them multiple times instead of causing issues with parameters that needs to
# be set per-URL.
CURL_OPTIONS=""

# The URLs to be downloaded.
URLS=""

# Variable used to be set to the percent-decoded filename parsed from the URL, unless
# --output or --no-decode-filename are used.
OUTPUT_PATH=""
HAS_USER_SET_OUTPUT="false"

# The parameters that are passed per-URL to curl.
readonly PER_URL_PARAMETERS="\
    --fail \
    --globoff \
    --location \
    --proto-default https \
    --remote-time \
    --retry 10 \
    --retry-max-time 10 "

# Whether to invoke curl or not.
DRY_RUN="false"

# Sanitize parameters.
sanitize()
{
    if [ -z "${URLS}" ]; then
        error "You must provide at least one URL to download."
    fi

    readonly CURL_OPTIONS URLS DRY_RUN HAS_USER_SET_OUTPUT
}

# Indicate via exit code whether the string given in the first parameter
# consists solely of characters from the string given in the second parameter.
# In other words, it returns 0 if the first parameter only contains characters
# from the second parameter, e.g.: Are $1 characters a subset of $2 characters?
is_subset_of()
{
    case "${1}" in
        *[!${2}]*|'') return 1;;
    esac
}

# Print the given string percent-decoded.
percent_decode()
{
    # Encodings of control characters (00-1F) are passed through without decoding.
    # Iterate on the input character-by-character, decoding it.
    printf "%s\n" "${1}" | fold -w1 | while IFS= read -r decode_out; do
        # If character is a "%", read the next character as decode_hex1.
        if [ "${decode_out}" = % ] && IFS= read -r decode_hex1; then
            decode_out="${decode_out}${decode_hex1}"
            # If there's one more character, read it as decode_hex2.
            if IFS= read -r decode_hex2; then
                decode_out="${decode_out}${decode_hex2}"
                # Skip decoding if this is a control character (00-1F).
                # Skip decoding if DECODE_FILENAME is not "true".
                if is_subset_of "${decode_hex1}" "23456789abcdefABCDEF" && \
                    is_subset_of "${decode_hex2}" "0123456789abcdefABCDEF" && \
                    [ "${DECODE_FILENAME}" = "true" ]; then
                    # Use printf to decode it into octal and then decode it to the final format.
                    decode_out="$(printf "%b" "\\$(printf %o "0x${decode_hex1}${decode_hex2}")")"
                fi
            fi
        fi
        printf %s "${decode_out}"
    done
}

# Print the percent-decoded filename portion of the given URL.
get_url_filename()
{
    # Remove protocol and query string if present.
    hostname_and_path="$(printf %s "${1}" | sed -e 's,^[^/]*//,,' -e 's,?.*$,,')"
    # If what remains contains a slash, there's a path; return it percent-decoded.
    case "${hostname_and_path}" in
        # sed to remove everything preceding the last '/', e.g.: "example/something" becomes "something"
        */*) percent_decode "$(printf %s "${hostname_and_path}" | sed -e 's,^.*/,,')";;
    esac
    # No slash means there was just a hostname and no path; return empty string.
}

# Execute curl with the list of URLs provided by the user.
exec_curl()
{
    CMD="curl "

    # Store version to check if it supports --no-clobber and --parallel.
    curl_version=$($CMD --version | cut -f2 -d' ' | head -n1)
    curl_version_major=$(echo "$curl_version" | cut -f1 -d.)
    curl_version_minor=$(echo "$curl_version" | cut -f2 -d.)

    CURL_HAS_NO_CLOBBER=""
    CURL_HAS_PARALLEL=""
    # --no-clobber is only supported since 7.83.0.
    # --parallel is only supported since 7.66.0.
    if [ "${curl_version_major}" -ge 8 ]; then
        CURL_HAS_NO_CLOBBER="--no-clobber"
        CURL_HAS_PARALLEL="--parallel"
    elif [ "${curl_version_major}" -eq 7 ];then
        if [ "${curl_version_minor}" -ge 83 ]; then
            CURL_HAS_NO_CLOBBER="--no-clobber"
        fi
        if [ "${curl_version_minor}" -ge 66 ]; then
            CURL_HAS_PARALLEL="--parallel"
        fi
    fi

    # Detecting whether we need --parallel.  It's easier to rely on
    # the shell's argument parsing.
    # shellcheck disable=SC2086
    set -- $URLS

    if [ "$#" -gt 1 ]; then
        CURL_PARALLEL="$CURL_HAS_PARALLEL"
    else
        CURL_PARALLEL=""
    fi

    # Start assembling the command.
    #
    # We use 'set --' here (again) because (a) we don't have arrays on
    # POSIX shell, and (b) we need better control over the way we
    # split arguments.
    #
    # shellcheck disable=SC2086
    set -- ${CMD} ${CURL_PARALLEL}

    NEXT_PARAMETER=""
    for url in ${URLS}; do
        # If the user did not provide an output path, define one.
        if [ "${HAS_USER_SET_OUTPUT}" = "false" ]; then
            OUTPUT_PATH="$(get_url_filename "${url}")"
            # If we could not get a path from the URL, use the default: index.html.
            [ -z "${OUTPUT_PATH}" ] && OUTPUT_PATH=index.html
        fi
        # shellcheck disable=SC2086
        set -- "$@" ${NEXT_PARAMETER} ${PER_URL_PARAMETERS} ${CURL_HAS_NO_CLOBBER} ${CURL_OPTIONS} --output "${OUTPUT_PATH}" "${url}"
        NEXT_PARAMETER="--next"
    done

    if [ "${DRY_RUN}" = "false" ]; then
        exec "$@"
    else
        printf "%s\n" "$@"
    fi
}

# Default to decoding the output filename
DECODE_FILENAME="true"

# Use "${1-}" in order to avoid errors because of 'set -u'.
while [ -n "${1-}" ]; do
    case "${1}" in
        --curl-options=*)
            opt=$(printf "%s\n" "${1}" | sed 's/^--curl-options=//')
            CURL_OPTIONS="${CURL_OPTIONS} ${opt}"
            ;;

        --curl-options)
            shift
            CURL_OPTIONS="${CURL_OPTIONS} ${1}"
            ;;

        --dry-run)
            DRY_RUN="true"
            ;;

        --output=*)
            opt=$(printf "%s\n" "${1}" | sed 's/^--output=//')
            HAS_USER_SET_OUTPUT="true"
            OUTPUT_PATH="${opt}"
            ;;

        -o|-O|--output)
            shift
            HAS_USER_SET_OUTPUT="true"
            OUTPUT_PATH="${1}"
            ;;

        -o*|-O*)
            opt=$(printf "%s\n" "${1}" | sed 's/^-[oO]//')
            HAS_USER_SET_OUTPUT="true"
            OUTPUT_PATH="${opt}"
            ;;

        --no-decode-filename)
            DECODE_FILENAME="false"
            ;;

        -h|--help)
            usage
            exit 0
            ;;

        -V|--version)
            print_version
            exit 0
            ;;

        --)
            # This is the start of the list of URLs.
            shift
            for url in "$@"; do
                # Encode whitespaces into %20, since wget supports those URLs.
                newurl=$(printf "%s\n" "${url}" | sed 's/ /%20/g')
                URLS="${URLS} ${newurl}"
            done
            break
            ;;

        -*)
            error "Unknown option: '$1'."
            ;;

        *)
            # This must be a URL.
            # Encode whitespaces into %20, since wget supports those URLs.
            newurl=$(printf "%s\n" "${1}" | sed 's/ /%20/g')
            URLS="${URLS} ${newurl}"
            ;;
    esac
    shift
done

sanitize
exec_curl