#!/bin/bash
#
# Specialized archive downloader for mailman v3.x
#
# Features
#
# * Downloads all gz and html files and attachments
# * Filters: skip_gz, skip_attachment, skip_list, skip_email
# * Doesn't download the same file twice
# * Stores the files of a list in its own folder
#
# Usage
#
# * Install gawk and curl
# * Run this script
# * Answer the questions
#   * Mailman URL: the "root" URL for the mailman software. No CGI page, no trailing slash
#   * List name: the list ID, i.e. what appears after xxx.cgi/{list_name} in the mailman URL
#   * Admin password: the admin password for the list
#
# Flags and options
#
# You can pass flags and options by using the export command. For example:
# If you want to skip gz files and attachments
# export skip_gz=1
# If you want to provide the url, list name, and password with no questions asked
# export url=https://mailman.example.com
# export nom=list-example.com
# export password=thisbemysecurepassword
#
# Notes
#
# * When gz is mentioned, the files aren't actually compressed... Beats me.
# * This script could use parallel. A full run is quite long and tedious.
#
# @author alban
# @since 2020-02-20

# Helper function: cURL wrapper with cookies and target file management
#
# Arguments:
#   $1 - value sent in the "Cookie:" request header
#   $2 - URL to fetch
#   $3 - path of the local file to write (parent folders are created)
# Outputs:
#   the path of the written file on stdout when a download succeeded
# Returns:
#   0 when the file already exists with a non-zero size (nothing is fetched)
#   or when the download succeeded; otherwise the status of the failing
#   command (dirname, mkdir or curl)
http() {
  local cookie=$1
  local url=$2
  local file=$3
  local dir
  local status=0

  # Never download the same file twice
  [[ -s "$file" ]] && return 0

  # Declared separately from the assignment so a failure is not masked by `local`
  dir=$(dirname -- "$file") || return
  mkdir -p -- "$dir" || return

  curl -s -H "Cookie: $cookie" "$url" -o "$file" || status=$?
  if (( status != 0 )); then
    # Drop partial output: a non-empty leftover would be skipped by later runs
    rm -f -- "$file"
    return "$status"
  fi

  printf '%s\n' "$file"
}

# Helper function: exit + message
#
# Arguments:
#   $@ - words of the message, printed space-separated on one line
# Outputs:
#   the message on stderr, so it never mixes with data written to stdout
# Returns:
#   never returns; terminates the shell with exit status 1
panic() {
  # The brace must be followed by whitespace: `panic(){echo` is a syntax error
  printf '%s\n' "$*" >&2
  exit 1
}

# Some dependencies are mandatory
# `command -v` is a shell builtin, unlike `which`, which is an external
# program that may be missing or behave differently between systems.
for required_tool in gawk curl; do
  command -v "$required_tool" >/dev/null 2>&1 || panic "Please install $required_tool"
done

# Read user provided options
# Every value can be exported beforehand (url, nom, password); only the
# missing ones are asked interactively.

# `name` is accepted as an alias of `nom` for the list name
if [[ -z "$nom" && -n "$name" ]]; then
  nom=$name
fi

if [[ -z "$url" ]]; then
  read -r -p "Mailman URL (ex: https://list.example.com): " url
fi
if [[ -z "$nom" ]]; then
  read -r -p "List name (ex: list-example.com): " nom
fi
if [[ -z "$password" ]]; then
  # -s keeps the password off the screen; the newline typed by the user is
  # not echoed either, so emit one to keep later output on its own line
  read -r -s -p "Mot de passe admin: " password
  printf '\n' >&2
fi

# The URL is concatenated with "/xxx.cgi/..." below: tolerate a trailing slash
url=${url%/}

[[ -n "$url" && -n "$nom" ]] || panic "The Mailman URL and the list name are both required"

# get the cookie
# Log in on the admin page and keep only the response headers: the session
# cookie is returned in a Set-Cookie header.
cookie_file=$(mktemp) || panic "Unable to create a temporary file"

# --data-urlencode keeps passwords containing spaces, '&', '+' or '%' intact
curl -s -o /dev/null -D "$cookie_file" \
  -d admlogin=whatever \
  --data-urlencode "adminpw=${password}" \
  "${url}/admindb.cgi/${nom}"

# Header names are matched case-insensitively (curl prints them in lowercase
# for HTTP/2 responses). Only the "name=value" part before the first ';' is
# kept; several cookies are joined the way a Cookie request header expects.
cookie=$(gawk '
  BEGIN { IGNORECASE = 1 }
  match($0, /^set-cookie:[ \t]*([^;\r]*)/, a) {
    printf "%s%s", sep, a[1]
    sep = "; "
  }
' "$cookie_file")

rm -f -- "$cookie_file"

# Without a cookie every later request would only save login pages
[[ -n "$cookie" ]] || panic "Authentication failed: no cookie returned by ${url}/admindb.cgi/${nom}"

# Get the main html
# The index page of the private archive lists every period; it is parsed
# below to find the gz files and the html listings.
response_file="$nom/index.html"
http "$cookie" "${url}/private.cgi/${nom}" "$response_file"

# Nothing can be discovered without the index
[[ -s "$response_file" ]] || panic "Unable to download the archive index ${url}/private.cgi/${nom}"

echo "export response_file='$response_file'"

# Download the GZ files and attachments
prefix="${url}/private.cgi/${nom}"
if [[ -z "$skip_gz" ]]; then
  # Sample index line:
  # href="2009-June.txt.gz">[ Text 2 KB Gzip'd]</a></td>
  gawk 'match($0, /([0-9]{4}-[^"]*\.txt\.gz)/, a) {print a[1]}' "$response_file" |
    while IFS= read -r gz; do
      http "$cookie" "${prefix}/${gz}" "${nom}/${gz}"

      # Download attachments
      [[ -z "$skip_attachment" ]] || continue
      # The download may have failed: nothing to scan then
      [[ -s "${nom}/${gz}" ]] || continue

      # Sample line inside a gz file:
      # URL: <http://lists.domain.tld/private.cgi/list-domain.tld/attachments/20190503/225f91a0/attachment.html>
      # gawk has no non-greedy quantifier, hence [^>]* rather than .*?
      gawk 'match($0, /URL: <([^>]*)>/, a) {print a[1]}' "${nom}/${gz}" |
        while IFS= read -r attachment_url; do
          # These URLs come from message bodies. Keep only the path below the
          # list's private archive and always fetch it from the configured
          # server, so the admin cookie is never sent to another host and a
          # http/https mismatch does not matter.
          attachment_path=${attachment_url#*"/private.cgi/${nom}/"}
          if [[ "$attachment_path" == "$attachment_url" ]]; then
            printf 'Skipping attachment outside of the list archive: %s\n' "$attachment_url" >&2
            continue
          fi
          # Refuse paths that would escape the list folder
          if [[ "/${attachment_path}/" == */../* ]]; then
            printf 'Skipping suspicious attachment path: %s\n' "$attachment_url" >&2
            continue
          fi
          output_file="${nom}/${attachment_path}"
          http "$cookie" "${prefix}/${attachment_path}" "$output_file"
        done
    done
fi

# Download the HTML Files, lists ordered by date / thread / etc. first
if [[ -z "$skip_list" ]]; then

  # Each match is a link relative to the index, e.g. 2008-March/<type>.html
  # [^"]* stops at the closing quote, so a line holding several links cannot
  # be swallowed into one bogus match.
  gawk 'match($0, /href="([^"]*html)"/, a) {print a[1]}' "$response_file" |
    while IFS= read -r list_url; do
      # Only relative links that stay inside the list folder are followed
      if [[ "$list_url" == *://* || "$list_url" == /* || "/${list_url}/" == */../* ]]; then
        continue
      fi

      list_dir=$(dirname -- "$list_url")
      http "$cookie" "$url/private.cgi/$nom/$list_url" "$nom/$list_url"

      # Download single emails
      [[ -z "$skip_email" ]] || continue
      # The download may have failed: nothing to scan then
      [[ -s "$nom/$list_url" ]] || continue

      # Sample listing line:
      # <LI><A HREF="020532.html">[List] Some subject line
      gawk 'match($0, /(href|HREF)="([0-9]+\.html)"/, a) {print a[2]}' "$nom/$list_url" |
        while IFS= read -r email_url; do
          output_file="$nom/${list_dir}/${email_url}"
          http "$cookie" "$url/private.cgi/$nom/$list_dir/$email_url" "$output_file"
        done
    done

fi