#!/bin/bash
#
# Specialized archive downloader for mailman v3.x
#
# Features
#
# * Downloads all gz archives, HTML pages, and attachments
# * Filters: skip_gz, skip_attachment, skip_list, skip_email
# * Never downloads the same file twice
# * Stores each list's files in its own folder (see the layout sketch below)
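#
# For illustration only (actual month names, message numbers, and
# attachment paths come from the archive itself), the on-disk layout for
# a list named list-example.com would look roughly like this:
#
#   list-example.com/index.html
#   list-example.com/2009-June.txt.gz
#   list-example.com/2009-June/date.html
#   list-example.com/2009-June/020532.html
#   list-example.com/attachments/20190503/225f91a0/attachment.html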
#
# Usage
#
# * Install gawk and curl
# * Run this script
# * Answer questions
# * Mailman URL: the "root" URL for the mailman software. No CGI page, no trailing slash
# * List name: the list ID, i.e. what appears after xxx.cgi/{list_name} in the mailman URL
# * Admin password: the admin password for the list
#
# Flags and options
#
# You can pass flags and options through environment variables, using the
# export command. For example, to skip gz files (and the attachments that
# are extracted from them):
# export skip_gz=1
# To provide the URL, list name, and password with no questions asked:
# export url=https://mailman.example.com
# export nom=list-example.com
# export password=thisbemysecurepassword
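#
# The same variables can also be passed for a single run without export
# (the script path is illustrative):
# url=https://mailman.example.com nom=list-example.com password=thisbemysecurepassword ./mailman-downloader.sh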
#
# Notes
#
# * Although the archive links end in .txt.gz, the files served aren't
#   actually compressed... Beats me.
# * This script could use parallel; a full run is quite long and tedious.
#
# @author alban
# @since 2020-02-20
# Helper function: cURL wrapper with cookies and target file management
http(){
local cookie="${1}"
local url="${2}"
local file="${3}"
# Skip files that have already been downloaded
[[ -s "$file" ]] && return
local dir=$(dirname "$file")
[[ ! -d "$dir" ]] && mkdir -p "$dir"
curl -s -H "Cookie: $cookie" "$url" -o "$file" && echo "$file"
}
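# For instance (hypothetical values), the following call would fetch the
# archive index into list-example.com/index.html, creating the folder and
# skipping the download if the file already exists:
# http "$cookie" "https://mailman.example.com/private.cgi/list-example.com" "list-example.com/index.html"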
# Helper function: exit + message
panic(){ echo "$@"; exit 1; }
# Some dependencies are mandatory
which gawk &>/dev/null || panic "Please install gawk"
which curl &>/dev/null || panic "Please install curl"
# Read user provided options
[ -z "$url" ]] && read -p "Mailman URL (ex: https://list.example.com): " url
[ -z "$nom" ]] && read -p "List name (ex: list-example.com): " nom
[ -z "$password" ]] && read -p "Mot de passe admin: " password
# Log into the admindb page to get the session cookie
cookie_file=$(mktemp)
curl -D "${cookie_file}" "${url}/admindb.cgi/${nom}" -d admlogin=whatever -d adminpw="${password}" -o /dev/null -s
cookie=$(grep Set-Cookie "${cookie_file}" | gawk 'match($0, /Set-Cookie: ([^;]*);.*/, a) {print a[1]}')
rm -f ${cookie_file}
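# The login response is expected to set a session cookie, e.g. (name and
# value are illustrative): Set-Cookie: list-example.com+admin=abc123; Path=/
# The gawk match keeps only the name=value pair before the first ';'.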
# Get the main html
response_file="$nom/index.html"
http "$cookie" "${url}/private.cgi/${nom}" "$response_file" ""
echo "export response_file='$response_file'"
# Download the GZ files and attachments
prefix="${url}/private.cgi/${nom}"
if [[ -z "$skip_gz" ]] ; then
# href="2009-June.txt.gz">[ Text 2 KB Gzip<69>s]</a></td>
gawk "match(\$0, /.*([0-9]{4}-.*txt.gz)/,a) {print a[1]}" "$response_file"| while read gz; do
http "$cookie" "$url/private.cgi/${nom}/$gz" "$nom/$gz"
# Download attachments
if [[ -z "$skip_attachment" ]] ; then
# URL: <http://lists.domain.tld/private.cgi/list-domain.tld/attachments/20190503/225f91a0/attachment.html>
cat "$nom/$gz" | gawk 'match($0, /URL: <(.*?)>/,a) {print a[1]}' | while read attachment_url ; do
output_file="$nom/${attachment_url/$prefix/}"
http "$cookie" "$attachment_url" "$output_file"
done
fi
done
fi
# Download the HTML Files, lists ordered by date / thread / etc. first
if [[ -z "$skip_list" ]] ; then
# return a list of 2008-March/<type>.html
gawk 'match($0, /href="(.*html)"/, a) {print a[1]}' "$response_file" | while read list_url; do
list_dir=$(dirname "$list_url")
http "$cookie" "$url/private.cgi/$nom/$list_url" "$nom/$list_url"
# Download single emails
if [[ -z "$skip_email" ]] ; then
# <LI><A HREF="020532.html">[List] Regardez la lumi&#232;re mes jolis
cat "$nom/$list_url" | gawk 'match($0, /(href|HREF)="([0-9]+.html)"/,a) {print a[2]}' | while read email_url ; do
output_file="$nom/${list_dir}/${email_url}"
http "$cookie" "$url/private.cgi/$nom/$list_dir/$email_url" "$output_file"
done
fi
done
fi
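# Re-running the script is safe: http() skips any file that already exists
# and is non-empty, so an interrupted run simply resumes where it stopped.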