Got the YouTube Music scraper working again.

editsong
Sander Vocke 5 years ago
parent af5a742d7c
commit 600a8ced5d
  1. 13
      .vscode/launch.json
  2. 5
      client/src/components/windows/settings/IntegrationSettings.tsx
  3. 132
      client/src/lib/integration/youtubemusic/YoutubeMusicWebScraper.tsx
  4. 6
      server/integrations/integrations.ts

@ -19,6 +19,19 @@
"console": "integratedTerminal", "console": "integratedTerminal",
"cwd": "${workspaceFolder}/server", "cwd": "${workspaceFolder}/server",
"internalConsoleOptions": "neverOpen" "internalConsoleOptions": "neverOpen"
},
{
"type": "node",
"request": "launch",
"name": "Development Server with SQLite @ dev.sqlite3",
"env": {
"API": "/api",
},
"program": "node_modules/.bin/nodemon",
"args": [ "server.ts" ],
"console": "integratedTerminal",
"cwd": "${workspaceFolder}/server",
"internalConsoleOptions": "neverOpen"
} }
] ]
} }

@ -195,6 +195,11 @@ let EditorWithTest = (props: any) => {
<Alert severity="success">Integration is active.</Alert> <Alert severity="success">Integration is active.</Alert>
) )
}) })
.catch((e: any) => {
setTestFlashMessage(
<Alert severity="error">Failed test: {e.message}</Alert>
)
})
}} }}
flashMessage={testFlashMessage} flashMessage={testFlashMessage}
showTestButton={true} showTestButton={true}

@ -2,6 +2,8 @@ import React from 'react';
import Integration, { IntegrationFeature, IntegrationAlbum, IntegrationArtist, IntegrationTrack } from '../Integration'; import Integration, { IntegrationFeature, IntegrationAlbum, IntegrationArtist, IntegrationTrack } from '../Integration';
import StoreLinkIcon from '../../../components/common/StoreLinkIcon'; import StoreLinkIcon from '../../../components/common/StoreLinkIcon';
import { IntegrationWith } from '../../../api/api'; import { IntegrationWith } from '../../../api/api';
import { runInNewContext } from 'vm';
import { TextRotateVertical } from '@material-ui/icons';
enum SearchType { enum SearchType {
Track = 'track', Track = 'track',
@ -19,82 +21,134 @@ export function extractInitialData(text: string): any | undefined {
// }); // });
// //
// the THIS part. // the THIS part.
//
// A second variant was also found in the field, in which the payload is additionally escape-encoded:
//
// initialData.push({
// path: '\/search',
// params: JSON.parse('\x7b\x22query\x22:\x22something\x22\x7d'),
// data: 'THIS2'
// })
// Here THIS2 is a string that itself contains escape sequences such as \x7b and \x22.
// Get the whole line containing the data part. // Handle the 1st case.
let pattern = /initialData\.push\({[\n\r\s]*path:.*[\n\r\s]+params:\s*{\s*['"]query['"].*[\n\r\s]+data:\s*['"](.*)['"]\s*[\n\r]/ let pattern = /initialData\.push\({[\n\r\s]*path:.*[\n\r\s]+params:\s*{\s*['"]query['"].*[\n\r\s]+data:\s*['"](.*)['"]\s*[\n\r]/
let m = text.match(pattern); let m = text.match(pattern);
let dataline = Array.isArray(m) && m.length >= 2 ? m[1] : undefined; let dataline1 = Array.isArray(m) && m.length >= 2 ? m[1] : undefined;
if (!dataline) { return undefined; }
// Now parse the data line. // Now parse the data line.
let dataline_clean = dataline.replace(/\\"/g, '"').replace(/\\\\"/g, '\\"') let dataline1_clean = dataline1 ? dataline1.replace(/\\"/g, '"').replace(/\\\\"/g, '\\"') : undefined;
let json1 = dataline1_clean ? JSON.parse(dataline1_clean) : undefined;
// Handle the 2nd case.
let m2 = text.match(/params:[\s]*JSON\.parse\('([^']*)'\),[\n\r\s]*data:[\s]*'([^']*)'/g);
let json2: any = undefined;
if (Array.isArray(m2)) {
m2.forEach((match: string) => {
let decode = (s: string) => {
var r = /\\x([\d\w]{2})/gi;
let res = s.replace(r, function (match, grp) {
return String.fromCharCode(parseInt(grp, 16));
});
return unescape(res);
}
let paramsline: string = decode((match.match(/params:[\s]*JSON\.parse\('([^']*)'/) as string[])[1]);
if (!('query' in JSON.parse(paramsline))) {
return;
}
let dataline2: string = decode((match.match(/data:[\s]*'([^']*)'/) as string[])[1]);
json2 = JSON.parse(dataline2);
})
}
let json = JSON.parse(dataline_clean); // Return either one that worked.
return json; let result = json1 || json2;
console.log("initial data:", result);
return result;
} }
export function parseTracks(initialData: any): IntegrationTrack[] { export function parseTracks(initialData: any): IntegrationTrack[] {
try { try {
var musicResponsiveListItemRenderers: any[] = []; var musicResponsiveListItemRenderers: any[] = [];
// Scrape for any "Track"-type items. // Scrape for any "Song"-type items.
initialData.contents.sectionListRenderer.contents.forEach((c: any) => { initialData.contents.sectionListRenderer.contents.forEach((c: any) => {
if (c.musicShelfRenderer) { if (c.musicShelfRenderer) {
c.musicShelfRenderer.contents.forEach((cc: any) => { c.musicShelfRenderer.contents.forEach((cc: any) => {
if (cc.musicResponsiveListItemRenderer && if (cc.musicResponsiveListItemRenderer &&
cc.musicResponsiveListItemRenderer.flexColumns && cc.musicResponsiveListItemRenderer.flexColumns &&
cc.musicResponsiveListItemRenderer.flexColumns[1] cc.musicResponsiveListItemRenderer.flexColumns[1]
.musicResponsiveListItemFlexColumnRenderer.text.runs[0].text === "Track") { .musicResponsiveListItemFlexColumnRenderer.text.runs[0].text === "Song") {
musicResponsiveListItemRenderers.push(cc.musicResponsiveListItemRenderer); musicResponsiveListItemRenderers.push(cc.musicResponsiveListItemRenderer);
} }
}) })
} }
}) })
return musicResponsiveListItemRenderers.map((s: any) => { console.log("Found song itemrenderers:", musicResponsiveListItemRenderers);
let videoId = s.doubleTapCommand.watchEndpoint.videoId;
let columns = s.flexColumns;
if (columns[1].musicResponsiveListItemFlexColumnRenderer.text.runs[0].text !== "Track") {
throw new Error('song item doesnt match scraper expectation');
}
let title = columns[0].musicResponsiveListItemFlexColumnRenderer.text.runs[0].text;
let artists = columns[2].musicResponsiveListItemFlexColumnRenderer.text.runs.filter((run: any) => {
return 'navigationEndpoint' in run;
}).map((run: any) => {
let id = run.navigationEndpoint.browseEndpoint.browseId;
return {
url: `https://music.youtube.com/browse/${id}`,
name: run.text,
}
});
let albums = columns[3].musicResponsiveListItemFlexColumnRenderer.text.runs.filter((run: any) => { return musicResponsiveListItemRenderers.map((s: any) => {
return 'navigationEndpoint' in run; // There are some options that were encountered in the field.
}).map((run: any) => { // let videoId: string | undefined = undefined;
let id = run.navigationEndpoint.browseEndpoint.browseId; // if('doubleTapCommand' in s) s = s || s.doubleTapCommand.watchEndpoint.videoId;
return { // if('playlistItemData' in s) s = s || s.playlistItemData.videoId;
url: `https://music.youtube.com/browse/${id}`,
name: run.text, let runs: any[] = [];
artist: artists[0], // Gather all 'runs' fields together from all columns.
s.flexColumns.forEach((column: any) => {
runs.push(...column.musicResponsiveListItemFlexColumnRenderer.text.runs);
})
// Find the runs that hold the title, artist or album.
let title: string | undefined = undefined;
let album: IntegrationAlbum = {};
let artist: IntegrationArtist = {};
let videoId: string | undefined = undefined;
runs.forEach((run: any) => {
if ('navigationEndpoint' in run &&
'watchEndpoint' in run.navigationEndpoint &&
'videoId' in run.navigationEndpoint.watchEndpoint) {
videoId = run.navigationEndpoint.watchEndpoint.videoId;
title = run.text;
} else if ('navigationEndpoint' in run &&
'browseEndpoint' in run.navigationEndpoint &&
'browseEndpointContextSupportedConfigs' in run.navigationEndpoint.browseEndpoint &&
'browseEndpointContextMusicConfig' in run.navigationEndpoint.browseEndpoint.browseEndpointContextSupportedConfigs &&
'pageType' in run.navigationEndpoint.browseEndpoint.browseEndpointContextSupportedConfigs.browseEndpointContextMusicConfig &&
run.navigationEndpoint.browseEndpoint.browseEndpointContextSupportedConfigs.browseEndpointContextMusicConfig.pageType === 'MUSIC_PAGE_TYPE_ALBUM') {
album = {
url: `https://music.youtube.com/browse/${run.navigationEndpoint.browseEndpoint.browseId}`,
name: run.text,
}
} else if ('navigationEndpoint' in run &&
'browseEndpoint' in run.navigationEndpoint &&
'browseEndpointContextSupportedConfigs' in run.navigationEndpoint.browseEndpoint &&
'browseEndpointContextMusicConfig' in run.navigationEndpoint.browseEndpoint.browseEndpointContextSupportedConfigs &&
'pageType' in run.navigationEndpoint.browseEndpoint.browseEndpointContextSupportedConfigs.browseEndpointContextMusicConfig &&
run.navigationEndpoint.browseEndpoint.browseEndpointContextSupportedConfigs.browseEndpointContextMusicConfig.pageType === 'MUSIC_PAGE_TYPE_ARTIST') {
artist = {
url: `https://music.youtube.com/browse/${run.navigationEndpoint.browseEndpoint.browseId}`,
name: run.text,
}
} }
}); });
if(album.name && artist.name) {
album.artist = artist;
}
return { return {
title: title, title: title,
url: `https://music.youtube.com/watch?v=${videoId}`, url: `https://music.youtube.com/watch?v=${videoId}`,
artist: artists[0], artist: artist,
album: albums[0], album: album,
} }
}) })
} catch (e) { } catch (e) {
console.log("Error parsing songs:", e.message); console.log("Error parsing tracks:", e.message);
return []; return [];
} }
} }
export function parseArtists(initialData: any): IntegrationTrack[] { export function parseArtists(initialData: any): IntegrationArtist[] {
try { try {
var musicResponsiveListItemRenderers: any[] = []; var musicResponsiveListItemRenderers: any[] = [];

@ -45,6 +45,9 @@ export function createIntegrations(knex: Knex, apiBaseUrl: string) {
let replaced = path.replace(new RegExp(`${apiBaseUrl}/integrations/[0-9]+/`), ''); let replaced = path.replace(new RegExp(`${apiBaseUrl}/integrations/[0-9]+/`), '');
console.log("Rewrite URL:", path, replaced); console.log("Rewrite URL:", path, replaced);
return replaced; return replaced;
},
onProxyReq: (proxyReq: any, req: any, res: any) => {
console.log('--> ', req.method, req.path, '->', proxyReq.path);
} }
}); });
@ -57,6 +60,9 @@ export function createIntegrations(knex: Knex, apiBaseUrl: string) {
let replaced = path.replace(new RegExp(`${apiBaseUrl}/integrations/[0-9]+/`), ''); let replaced = path.replace(new RegExp(`${apiBaseUrl}/integrations/[0-9]+/`), '');
console.log("Rewrite URL:", path, replaced); console.log("Rewrite URL:", path, replaced);
return replaced; return replaced;
},
onProxyReq: (proxyReq: any, req: any, res: any) => {
console.log('--> ', req.method, req.path, '->', proxyReq.path);
} }
}) })

Loading…
Cancel
Save