...
@tf.function
def train(t_train, s_train, t_phys):
    # Data loss
    
    # predict displacement
    s_train_hat = net.predict(t_train)
    # MSE loss between training data and predictions
    data_loss = tf.math.reduce_mean(
        tf.math.square(s_train - s_train_hat)
    )
    # Physics loss
    
    # predict displacement
    s_phys_hat = net.predict(t_phys)
    # split into individual x and y components
    s_x = s_phys_hat[:, 0]
    s_y = s_phys_hat[:, 1]
    # take the gradients to get predicted velocity and acceleration
    v_x = tf.gradients(s_x, t_phys)[0]
    v_y = tf.gradients(s_y, t_phys)[0]
    a_x = tf.gradients(v_x, t_phys)[0]
    a_y = tf.gradients(v_y, t_phys)[0]
    # combine individual x and y components into velocity and
    # acceleration vectors
    v = tf.concat([v_x, v_y], axis=1)
    a = tf.concat([a_x, a_y], axis=1)
    # as acceleration is the known equation, this is what we want to
    # perform gradient descent on.
    # therefore, prevent any gradients flowing through the higher
    # order (velocity) terms
    v = tf.stop_gradient(v)
    # define speed (velocity norm, the ||v|| in the equation) and
    # gravity vector for physics equation
    speed = tf.norm(v, axis=1, keepdims=True)
    g = [[0.0, 9.81]]
    # MSE between known physics equation and network gradients
    phys_loss = tf.math.reduce_mean(
        tf.math.square(-mu * speed * v - g - a)
    )
    # Total loss
    
    loss = data_weight * data_loss + phys_weight * phys_loss
    
    # Gradient step
    
    # minimise the combined loss with respect to both the neural
    # network parameters and the unknown physics variable, mu
    gradients = tf.gradients(loss, net.train_vars + [mu])
    optimiser.apply_gradients(zip(gradients, net.train_vars + [mu]))
...