![[personal profile]](https://www.dreamwidth.org/img/silk/identity/user.png)
This is how I'm converting LiveJournal/Dreamwidth CSV exports to HTML using PHP.
Could it be better? Sure. Please make it better. In its current form, it can convert 20 years of data in about 5 seconds. You download your data on your own, then run this script against the directory.
I tried using XML files, but the entries were missing breaks, meaning that the converted entries turned into undifferentiated blocks of text. Rather than continue development for a worse result, I abandoned that approach.
//--------------------------------------------------------
Could it be better? Sure. Please make it better. In its current form, it can convert 20 years of data in about 5 seconds. You download your data on your own, then run this script against the directory.
I tried using XML files, but the entries were missing breaks, meaning that the converted entries turned into undifferentiated blocks of text. Rather than continue development for a worse result, I abandoned that approach.
<?PHP
//--------------------------------------------------------
// Global Variables - destined for INI file
//--------------------------------------------------------
$_INI = array();
$_INI['JournalBasePath'] = '/Documents/Dreamwidth';
$_INI['JournalCSVPath'] = $_INI['JournalBasePath'] . '/CSV';
$_INI['JournalHTMLPath'] = $_INI['JournalBasePath'] . '/HTMLPHP';
$_INI['JournalID'] = 'yourid';
$_INI['JournalSite'] = 'dreamwidth.org';
$_INI['JournalBaseURL'] = 'https://' . $_INI['JournalID'] . '.' . $_INI['JournalSite'];
// date() format used for the "Date:" line of every entry.
$_INI['JournalDateFormat'] = 'l, F d, Y h:i A';
//--------------------------------------------------------
// Retrieve CSV File List
//--------------------------------------------------------
$CSV_List = scandir($_INI['JournalCSVPath'], SCANDIR_SORT_ASCENDING);
if ($CSV_List === false) {
    // scandir() returns false (not an empty array) when the directory is
    // missing or unreadable; stop here instead of iterating over false.
    error_log('Unable to read CSV directory: ' . $_INI['JournalCSVPath']);
    exit(1);
}
// Every conversion writes into this directory, so make sure it exists first.
if (!is_dir($_INI['JournalHTMLPath']) && !mkdir($_INI['JournalHTMLPath'], 0777, true)) {
    error_log('Unable to create HTML directory: ' . $_INI['JournalHTMLPath']);
    exit(1);
}
//--------------------------------------------------------
// Process the File List
//--------------------------------------------------------
foreach ($CSV_List as $CSV) {
    // Escaped dot and end anchor: only names that really end in ".csv" match
    // (the unanchored '/.csv/' also matched names such as "mycsvnotes.txt").
    // is_file() skips directories that happen to carry a .csv suffix.
    if (preg_match('/\.csv$/', $CSV) === 1 && is_file($_INI['JournalCSVPath'] . '/' . $CSV)) {
        Convert_CSV($CSV, $_INI);
    }
}
//--------------------------------------------------------
// Function - Convert the CSV into an HTML Document
//--------------------------------------------------------
/**
 * Convert one exported CSV file into a standalone HTML document.
 *
 * Reads $_INI['JournalCSVPath'] . '/' . $CSV, discards the first row (the
 * column header), renders every remaining row with Convert_Entry(), and
 * writes the result to $_INI['JournalHTMLPath'] using the CSV's base name
 * with an ".html" extension. The input path is echoed as a progress line.
 *
 * @param string $CSV  File name (not a full path) of the CSV inside JournalCSVPath.
 * @param array  $_INI Settings; JournalCSVPath and JournalHTMLPath are read here
 *                     and the whole array is handed on to Convert_Entry().
 * @return void
 */
function Convert_CSV ($CSV, $_INI) {
    //
    // Set Input and Output Files
    //
    $CSV_In = $_INI['JournalCSVPath'] . '/' . $CSV;
    // Strip only a literal, trailing ".csv". The previous pattern '/.csv/'
    // treated the dot as a wildcard and matched anywhere in the name, so
    // "mycsvdata.csv" was written out as "m.htmldata.html".
    $Name = preg_replace('/\.csv$/', '', $CSV);
    $HTML_Out = $_INI['JournalHTMLPath'] . '/' . $Name . '.html';
    echo $CSV_In . "\n";

    $handle = fopen($CSV_In, 'r');
    if ($handle === false) {
        error_log('Convert_CSV: unable to open ' . $CSV_In);
        return;
    }

    //
    // Header and H1 - the file name is escaped because it lands in HTML text.
    //
    $Title = htmlspecialchars($Name, ENT_QUOTES, 'UTF-8');
    $Document = "\n<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n<title>"
        . $Title
        . "</title>\n</head>\n<body>\n";
    $Document .= '<h1>' . $Title . '</h1>' . "\n";

    //
    // Rows - enclosure and escape are spelled out with their default values
    // so the call does not rely on the implicit $escape default.
    //
    fgetcsv($handle, 0, ',', '"', '\\'); // first row is the column header; discard it
    while (($Post = fgetcsv($handle, 0, ',', '"', '\\')) !== false) {
        // fgetcsv() reports a blank line as array(null); there is no entry to render.
        if ($Post === array(null)) {
            continue;
        }
        $Document .= Convert_Entry($Post, $_INI);
    }
    fclose($handle);

    //
    // Footer, then a single locked write instead of reopening the output
    // file once per row.
    //
    $Document .= "\n</body>\n</html>\n";
    if (file_put_contents($HTML_Out, $Document, LOCK_EX) === false) {
        error_log('Convert_CSV: unable to write ' . $HTML_Out);
    }
}
//--------------------------------------------------------
// Function - Convert an entry into HTML
//--------------------------------------------------------
/**
 * Render one CSV row (a journal entry) as an HTML fragment.
 *
 * Columns read from $Post:
 *   0 = itemid
 *   1 = eventtime
 *   3 = subject
 *   4 = event (body of post)
 *   7 = current_music
 *   8 = current_mood
 * Columns 2 (logtime), 5 (security) and 6 (allowmask) are not emitted.
 *
 * The fragment is, in order: <h2> subject, date, item id, mood (if any),
 * music (if any), link to the original post, and the body.
 *
 * @param array $Post  One row as returned by fgetcsv().
 * @param array $_INI  Settings; reads JournalDateFormat and JournalBaseURL.
 * @return string HTML fragment for the entry.
 */
function Convert_Entry ($Post, $_INI) {
    // Pad short rows and turn nulls into '' so every column below can be
    // read without undefined-offset notices.
    $Post = array_map('strval', array_pad($Post, 9, ''));

    //
    // ID
    //
    $ItemID = '<p><strong>ItemID:</strong> ' . $Post[0] . '</P>' . "\n";

    //
    // Date - fall back to the raw text when it cannot be parsed, rather than
    // letting a false timestamp be formatted as the 1970 epoch.
    //
    $Timestamp = strtotime($Post[1]);
    $Formatted_Date = ($Timestamp === false)
        ? $Post[1]
        : date($_INI['JournalDateFormat'], $Timestamp);
    $EventTime = '<p><strong>Date:</strong> ' . $Formatted_Date . "\n" . '</P>' . "\n";

    //
    // Entry TITLE - untitled entries are headed by their event time.
    //
    $Title = ($Post[3] === '') ? $Post[1] : $Post[3];
    $Subject = '<h2>' . $Title . '</h2>' . "\n";

    //
    // URL to Original Post (paragraph is now closed with </p>, not a second <p>)
    //
    $URL = $_INI['JournalBaseURL'] . '/' . $Post[0] . '.html';
    $WebLink = '<p><strong>Entry:</strong> <a href="' . $URL . '">' . $URL . '</a></p>' . "\n";

    //
    // Current Music / Current Mood (only when present)
    //
    $CurrentMusic = ($Post[7] !== '')
        ? '<p><strong>Current Music: </strong> ' . $Post[7] . '</p>' . "\n"
        : '';
    $CurrentMood = ($Post[8] !== '')
        ? '<p><strong>Current Mood: </strong> ' . $Post[8] . '</p>' . "\n"
        : '';

    //
    // Body - tweak the formatting to favor word processors.
    //
    // Every newline and every <br> variant becomes a paragraph boundary, and
    // the whole body is wrapped so each </p> has a matching <p>. Previously
    // breaks were replaced by a bare </p>, leaving the markup unbalanced.
    $Body = preg_replace('/\r?\n|<br\s*\/?>/i', "</p>\n<p>", $Post[4]);
    $Body = '<p>' . $Body . '</p>' . "\n";
    // Non-breaking spaces don't play nicely with word processors. Handle both
    // the entity and the literal UTF-8 character.
    $Body = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $Body);
    // <user site="livejournal.com" user="somebody"> is not an HTML tag and
    // would not display; show the user name underlined instead.
    $Body = preg_replace('/<user site=".*?" user="(.*?)">/', '<u>${1}</u>', $Body);
    // An opening <center> with no closing tag anywhere would center the rest
    // of the document, so drop it.
    if (strpos($Body, '</center>') === false) {
        $Body = str_replace('<center>', '', $Body);
    }
    // Fixed font sizes cause problems in word processors; strip the opening tags.
    $Body = preg_replace('/<font size=.*?>/', '', $Body);

    //
    // Return the Entry
    //
    return $Subject . $EventTime . $ItemID . $CurrentMood . $CurrentMusic . $WebLink . $Body;
}
?>