#!/bin/bash
#
# Specialized archive downloader for mailman v3.x
#
# Features
#
# * Downloads all gz archives, html pages, and attachments
# * Filters: skip_gz, skip_attachment, skip_list, skip_email
# * Never downloads the same file twice
# * Stores the files of a list in the list's own folder
#
# Usage
#
# * Install gawk and curl
# * Run this script
# * Answer the questions
#   * Mailman URL: the "root" URL of the mailman install. No CGI page, no trailing slash
#   * List name: the list ID, i.e. what appears after xxx.cgi/{list_name} in a mailman URL
#   * Admin password: the admin password for the list
#
# Flags and options
#
# You can pass flags and options with the export command. For example,
# to skip gz files and attachments:
#   export skip_gz=1
#   export skip_attachment=1
# To provide the url, name, and password with no questions asked:
#   export url=https://mailman.example.com
#   export name=list-example.com
#   export password=thisbemysecurepassword
#
# Notes
#
# * Where gz is mentioned, the files aren't actually compressed... Beats me.
# * This script could use parallel. It's quite long and tedious.
#
# @author alban
# @since 2020-02-20

# Helper function: cURL wrapper with cookie handling and target file management.
# Skips the download when the target file already exists and is non-empty.
http(){
  local cookie="${1}"
  local url="${2}"
  local file="${3}"
  [[ -s "$file" ]] && return
  local dir
  dir=$(dirname "$file")
  [[ ! -d "$dir" ]] && mkdir -p "$dir"
  curl -s -H "Cookie: $cookie" "$url" -o "$file" && echo "$file"
}

# Helper function: print a message and exit
panic(){ echo "$@"; exit 1; }

# Some dependencies are mandatory
which gawk &>/dev/null || panic "Please install gawk"
which curl &>/dev/null || panic "Please install curl"

# Read user-provided options, unless they are already set in the environment
[[ -z "$url" ]] && read -p "Mailman URL (ex: https://list.example.com): " url
[[ -z "$name" ]] && read -p "List name (ex: list-example.com): " name
[[ -z "$password" ]] && read -p "Admin password: " password

# Get the session cookie by logging in through admindb.cgi
cookie_file=$(mktemp)
curl -D "${cookie_file}" "${url}/admindb.cgi/${name}" -d admlogin=whatever -d "adminpw=${password}" -o /dev/null -s
cookie=$(grep Set-Cookie "${cookie_file}" | gawk 'match($0, /Set-Cookie: ([^;]*);.*/, a) {print a[1]}')
rm -f "${cookie_file}"

# Get the main html page listing the archives
response_file="$name/index.html"
http "$cookie" "${url}/private.cgi/${name}" "$response_file"
echo "export response_file='$response_file'"

# Download the GZ files and attachments
prefix="${url}/private.cgi/${name}"
if [[ -z "$skip_gz" ]] ; then
  # Matches lines such as: href="2009-June.txt.gz">[ Text 2 KB Gzippés ]
  gawk 'match($0, /.*([0-9]{4}-.*\.txt\.gz)/, a) {print a[1]}' "$response_file" | while read -r gz; do
    http "$cookie" "${url}/private.cgi/${name}/${gz}" "$name/$gz"
    # Download attachments, referenced in the archive by lines such as: URL: <...>
    if [[ -z "$skip_attachment" ]] ; then
      gawk 'match($0, /URL: <([^>]*)>/, a) {print a[1]}' "$name/$gz" | while read -r attachment_url ; do
        output_file="$name/${attachment_url/$prefix/}"
        http "$cookie" "$attachment_url" "$output_file"
      done
    fi
  done
fi
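# The Notes above suggest this script could use parallel. A minimal sketch of
# that idea, left commented out so the sequential loops keep their original
# behavior: export the http helper and the variables it reads, then fan the
# gz downloads out with xargs. The -P 4 worker count is an arbitrary choice,
# and GNU parallel could stand in for xargs the same way.
#
#   export -f http
#   export cookie url name
#   gawk 'match($0, /.*([0-9]{4}-.*\.txt\.gz)/, a) {print a[1]}' "$response_file" \
#     | xargs -P 4 -I{} bash -c 'http "$cookie" "$url/private.cgi/$name/{}" "$name/{}"'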
# Download the HTML files: the index pages ordered by date / thread / etc. first
if [[ -z "$skip_list" ]] ; then
  # Returns a list of relative links such as 2008-March/date.html
  gawk 'match($0, /href="(.*html)"/, a) {print a[1]}' "$response_file" | while read -r list_url; do
    list_dir=$(dirname "$list_url")
    http "$cookie" "${url}/private.cgi/${name}/${list_url}" "$name/$list_url"
    # Download single emails, listed in the index as entries such as:
    #   • [List] Regardez la lumière mes jolis
    if [[ -z "$skip_email" ]] ; then
      gawk 'match($0, /(href|HREF)="([0-9]+\.html)"/, a) {print a[2]}' "$name/$list_url" | while read -r email_url ; do
        output_file="$name/${list_dir}/${email_url}"
        http "$cookie" "${url}/private.cgi/${name}/${list_dir}/${email_url}" "$output_file"
      done
    fi
  done
fi
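# Example invocation (hypothetical script name, values taken from the header
# above), answering no questions and skipping the monthly gz archives:
#
#   url=https://mailman.example.com name=list-example.com \
#   password=thisbemysecurepassword skip_gz=1 bash ./mailman3-archive-downloader.sh
#
# Downloads land under ./list-example.com/, mirroring the archive layout.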