#!/bin/bash
#
# Specialized archive downloader for mailman v3.x
#
# Features
#
# * Downloads all gz archives, HTML pages, and attachments
# * Filters: skip_gz, skip_attachment, skip_list, skip_email
# * Never downloads the same file twice
# * Stores each list's files in its own folder (see the layout sketch below)
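#
# For illustration only (actual month names, message numbers, and
# attachment paths come from the archive itself), the on-disk layout for
# a list named list-example.com would look roughly like this:
#
#   list-example.com/index.html
#   list-example.com/2009-June.txt.gz
#   list-example.com/2009-June/date.html
#   list-example.com/2009-June/020532.html
#   list-example.com/attachments/20190503/225f91a0/attachment.html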
#
# Usage
#
# * Install gawk and curl
# * Run this script
# * Answer questions
# * Mailman URL: the "root" URL for the mailman software. No CGI page, no trailing slash
# * List name: the list ID, i.e. what appears after xxx.cgi/{list_name} in the mailman URL
# * Admin password: the admin password for the list
#
# Flags and options
#
# You can pass flags and options through environment variables, using the
# export command. For example, to skip gz files (and the attachments that
# are extracted from them):
# export skip_gz=1
# To provide the URL, list name, and password with no questions asked:
# export url=https://mailman.example.com
# export nom=list-example.com
# export password=thisbemysecurepassword
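#
# The same variables can also be passed for a single run without export
# (the script path is illustrative):
# url=https://mailman.example.com nom=list-example.com password=thisbemysecurepassword ./mailman-downloader.sh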
#
# Notes
#
# * Although the archive links end in .txt.gz, the files served aren't
#   actually compressed... Beats me.
# * This script could use parallel; a full run is quite long and tedious.
#
# @author alban
# @since 2020-02-20
# Helper function: cURL wrapper with cookies and target file management
http(){
local cookie="${1}"
local url="${2}"
local file="${3}"
# Skip files that have already been downloaded
[[ -s "$file" ]] && return
local dir=$(dirname "$file")
[[ ! -d "$dir" ]] && mkdir -p "$dir"
curl -s -H "Cookie: $cookie" "$url" -o "$file" && echo "$file"
}
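# For instance (hypothetical values), the following call would fetch the
# archive index into list-example.com/index.html, creating the folder and
# skipping the download if the file already exists:
# http "$cookie" "https://mailman.example.com/private.cgi/list-example.com" "list-example.com/index.html"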
# Helper function: exit + message
panic(){ echo "$@"; exit 1; }
# Some dependencies are mandatory
which gawk &>/dev/null || panic "Please install gawk"
which curl &>/dev/null || panic "Please install curl"
# Read user provided options
[ -z "$url" ]] && read -p "Mailman URL (ex: https://list.example.com): " url
[ -z "$nom" ]] && read -p "List name (ex: list-example.com): " nom
[ -z "$password" ]] && read -p "Mot de passe admin: " password
# Log into the admindb page to get the session cookie
cookie_file=$(mktemp)
curl -D "${cookie_file}" "${url}/admindb.cgi/${nom}" -d admlogin=whatever -d adminpw="${password}" -o /dev/null -s
cookie=$(grep Set-Cookie "${cookie_file}" | gawk 'match($0, /Set-Cookie: ([^;]*);.*/, a) {print a[1]}')
rm -f ${cookie_file}
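# The login response is expected to set a session cookie, e.g. (name and
# value are illustrative): Set-Cookie: list-example.com+admin=abc123; Path=/
# The gawk match keeps only the name=value pair before the first ';'.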
# Get the main html
response_file="$nom/index.html"
http "$cookie" "${url}/private.cgi/${nom}" "$response_file" ""
echo "export response_file='$response_file'"
# Download the GZ files and attachments
prefix="${url}/private.cgi/${nom}"
if [[ -z "$skip_gz" ]] ; then
# href="2009-June.txt.gz">[ Text 2 KB Gzip<69>s]</a></td>
gawk "match(\$0, /.*([0-9]{4}-.*txt.gz)/,a) {print a[1]}" "$response_file"| while read gz; do
http "$cookie" "$url/private.cgi/${nom}/$gz" "$nom/$gz"
# Download attachments
if [[ -z "$skip_attachment" ]] ; then
# URL: <http://lists.domain.tld/private.cgi/list-domain.tld/attachments/20190503/225f91a0/attachment.html>
cat "$nom/$gz" | gawk 'match($0, /URL: <(.*?)>/,a) {print a[1]}' | while read attachment_url ; do
output_file="$nom/${attachment_url/$prefix/}"
http "$cookie" "$attachment_url" "$output_file"
done
fi
done
fi
# Download the HTML Files, lists ordered by date / thread / etc. first
if [[ -z "$skip_list" ]] ; then
# return a list of 2008-March/<type>.html
gawk 'match($0, /href="(.*html)"/, a) {print a[1]}' "$response_file" | while read list_url; do
list_dir=$(dirname "$list_url")
http "$cookie" "$url/private.cgi/$nom/$list_url" "$nom/$list_url"
# Download single emails
if [[ -z "$skip_email" ]] ; then
# <LI><A HREF="020532.html">[List] Regardez la lumi&#232;re mes jolis
cat "$nom/$list_url" | gawk 'match($0, /(href|HREF)="([0-9]+.html)"/,a) {print a[2]}' | while read email_url ; do
output_file="$nom/${list_dir}/${email_url}"
http "$cookie" "$url/private.cgi/$nom/$list_dir/$email_url" "$output_file"
done
fi
done
fi
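# Re-running the script is safe: http() skips any file that already exists
# and is non-empty, so an interrupted run simply resumes where it stopped.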